From 14491cf3d116c305d2e27fe0d005f0ff06c9ff60 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 24 Mar 2016 13:43:15 -0400 Subject: [PATCH] Big reformat. --- .../include/budgetedpostingcollector.h | 398 +- advancedbenchmarking/include/maropuparser.h | 243 +- .../include/statisticsrecorder.h | 140 +- advancedbenchmarking/src/budgetedtest.cpp | 2449 +- advancedbenchmarking/src/compflatstat.cpp | 66 +- advancedbenchmarking/src/compress.cpp | 234 +- advancedbenchmarking/src/entropy.cpp | 275 +- advancedbenchmarking/src/ramtocache.cpp | 689 +- advancedbenchmarking/src/readflat.cpp | 68 +- advancedbenchmarking/src/simplesynth.cpp | 172 +- advancedbenchmarking/src/uncompress.cpp | 238 +- example.cpp | 135 +- include/VarIntG8IU.h | 390 +- include/binarypacking.h | 303 +- include/bitpacking.h | 255 +- include/bitpackinghelpers.h | 1275 +- include/boolarray.h | 335 +- include/codecfactory.h | 260 +- include/codecs.h | 199 +- include/common.h | 14 +- include/compositecodec.h | 85 +- include/delta.h | 109 +- include/deltatemplates.h | 228 +- include/fastpfor.h | 625 +- include/for.h | 578 +- include/forcodec.h | 90 +- include/frameofreference.h | 196 +- include/hybm2.h | 1153 +- include/integratedbitpacking.h | 322 +- include/intersection.h | 87 +- include/mersenne.h | 111 +- include/platform.h | 24 +- include/simdbinarypacking.h | 630 +- include/simdbitpacking.h | 68 +- include/simdbitpackinghelpers.h | 1405 +- include/simdfastpfor.h | 419 +- include/simdintegratedbitpacking.h | 953 +- include/simdvariablebyte.h | 868 +- include/skipping.h | 446 +- include/sortedbitpacking.h | 316 +- include/streamvariablebyte.h | 399 +- include/synthetic.h | 610 +- include/timer.h | 103 +- include/usimdbitpacking.h | 234 +- include/util.h | 132 +- include/variablebyte.h | 2665 +- include/varintgb.h | 1736 +- src/benchintersection.cpp | 1688 +- src/benchsearch.cpp | 794 +- src/bitpacking.cpp | 17981 +++-- src/for-gen.c | 6849 +- src/for.c | 144 +- src/frameofreference.cpp | 54571 
++++++++-------- src/integratedbitpacking.cpp | 13265 ++-- src/intersection.cpp | 1357 +- src/simdbitpacking.cpp | 23112 +++---- src/simdintegratedbitpacking.cpp | 49675 +++++++------- src/simdpackedsearch.c | 6556 +- src/simdpackedselect.c | 9958 ++- src/streamvbyte.c | 2515 +- src/testcodecs.cpp | 344 +- src/testintegration.cpp | 241 +- src/unit.cpp | 1216 +- src/usimdbitpacking.cpp | 23119 +++---- src/varintdecode.c | 4082 +- tools/clang-format.sh | 22 + 66 files changed, 121059 insertions(+), 119160 deletions(-) create mode 100755 tools/clang-format.sh diff --git a/advancedbenchmarking/include/budgetedpostingcollector.h b/advancedbenchmarking/include/budgetedpostingcollector.h index acd9a8b..fb89ddd 100644 --- a/advancedbenchmarking/include/budgetedpostingcollector.h +++ b/advancedbenchmarking/include/budgetedpostingcollector.h @@ -6,7 +6,6 @@ #ifndef SIMDCompressionAndIntersection_BUDGETEDPOSTINGCOLLECTOR_H_ #define SIMDCompressionAndIntersection_BUDGETEDPOSTINGCOLLECTOR_H_ - #include "common.h" #include "util.h" #include "maropuparser.h" @@ -14,221 +13,226 @@ using namespace SIMDCompressionLib; - /* - * This class reads postings until their total size is below a threshold (memory budget). - * It encapsulates an out-of-order posting retrieval. It is not straightforward, because + * This class reads postings until their total size is below a threshold (memory + * budget). + * It encapsulates an out-of-order posting retrieval. It is not straightforward, + * because * the posting file format lacks a dictionary. 
*/ class BudgetedPostingCollector { public: - BudgetedPostingCollector(const string &postFileName, size_t memBudget) : - seenOffsets(0), readPostings(), - gapReader(postFileName), memBudget(memBudget), memUsed(0) { - if (!gapReader.open()) { - throw runtime_error(" could not open " + postFileName + " for reading..."); - } - readOffsets(); + BudgetedPostingCollector(const string &postFileName, size_t memBudget) + : seenOffsets(0), readPostings(), gapReader(postFileName), + memBudget(memBudget), memUsed(0) { + if (!gapReader.open()) { + throw runtime_error(" could not open " + postFileName + + " for reading..."); } - - ~BudgetedPostingCollector() { - gapReader.close(); + readOffsets(); + } + + ~BudgetedPostingCollector() { gapReader.close(); } + + /* + * checks whether the posting list index is between 0 and getMaxPostQty() + */ + bool valid(uint32_t postId) { return postId < seenOffsets.size(); } + + /* + * Returns false, when the memory budget is exhausted. + * Throws an exception in case of an error. It is assumed + * that postId is a valid identifier (call valid() to check). + */ + bool loadOnePost(uint32_t postId) { + if (readPostings.find(postId) != readPostings.end()) + return true; // already loaded, nothing to do + assert(valid(postId)); + gapReader.setPos(seenOffsets[postId]); + if (!gapReader.loadIntegers(readPostings[postId])) { + stringstream err; + err << "Cannot read posting list, id = " << postId; + throw runtime_error(err.str()); } - - /* - * checks whether the posting list index is between 0 and getMaxPostQty() - */ - bool valid(uint32_t postId) { - return postId < seenOffsets.size(); + if (!is_strictlysorted(readPostings[postId].begin(), + readPostings[postId].end())) { + stringstream err; + err << "Posting list is not in sorted order, id = " << postId; + throw runtime_error("not in sorted order"); } - - - - - /* - * Returns false, when the memory budget is exhausted. - * Throws an exception in case of an error. 
It is assumed - * that postId is a valid identifier (call valid() to check). - */ - bool loadOnePost(uint32_t postId) { - if (readPostings.find(postId) != readPostings.end()) - return true;// already loaded, nothing to do - assert(valid(postId)); - gapReader.setPos(seenOffsets[postId]); - if (!gapReader.loadIntegers(readPostings[postId])) { - stringstream err; - err << "Cannot read posting list, id = " << postId; - throw runtime_error(err.str()); - } - if (!is_strictlysorted(readPostings[postId].begin(), readPostings[postId].end())) { - stringstream err; - err << "Posting list is not in sorted order, id = " << postId; - throw runtime_error("not in sorted order"); - } - size_t qty = readPostings[postId].size(); - readPostings[postId].shrink_to_fit();// may or may not be useful - if (qty == 0) cout << "[WARNING] Empty posting list found." << endl; - size_t addMem = qty * sizeof(uint32_t); - if (memUsed + addMem > memBudget) { - readPostings.erase(postId); - return false; - } - memUsed += addMem; - return true; + size_t qty = readPostings[postId].size(); + readPostings[postId].shrink_to_fit(); // may or may not be useful + if (qty == 0) + cout << "[WARNING] Empty posting list found." << endl; + size_t addMem = qty * sizeof(uint32_t); + if (memUsed + addMem > memBudget) { + readPostings.erase(postId); + return false; } - /* - * Returns false, when the memory budget is exhausted. - * Throws an exception in case of an error. - * It is assume that all ideas are valid (call valid()). - * - * this is basically a convenience wrapper around loadOnePost. - */ - bool loadPostings(const vector &postIds) { - for (const auto id : postIds) { - if (!loadOnePost(id)) return false; - } - return true; + memUsed += addMem; + return true; + } + /* + * Returns false, when the memory budget is exhausted. + * Throws an exception in case of an error. + * It is assume that all ideas are valid (call valid()). + * + * this is basically a convenience wrapper around loadOnePost. 
+ */ + bool loadPostings(const vector &postIds) { + for (const auto id : postIds) { + if (!loadOnePost(id)) + return false; } - - /* - * how many posting lists are there to be loaded? - * this is not necessarily the total number of buffered - * posting lists. - */ - size_t getMaxPostQty() const { return seenOffsets.size(); } - - - /** - * This finds the largest and smallest document ID from a set of queries. - */ - pair findDocumentIDRange(const vector> &allPostIds) { - uint32_t largestpossible = numeric_limits::max(); - uint32_t smallestpossible = 0; - pair answer = make_pair(largestpossible, smallestpossible); - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - vector &onePost = getOnePost(id); - if (onePost.empty()) continue; - if (onePost.front() < answer.first) answer.first = onePost.front(); - if (onePost.back() > answer.second) answer.second = onePost.back(); - } - } - if (answer.first > answer.second) { - cout << "[WARNING] invalid range because no data." << endl; - answer.second = answer.first; - } - return answer; + return true; + } + + /* + * how many posting lists are there to be loaded? + * this is not necessarily the total number of buffered + * posting lists. + */ + size_t getMaxPostQty() const { return seenOffsets.size(); } + + /** + * This finds the largest and smallest document ID from a set of queries. 
+ */ + pair + findDocumentIDRange(const vector> &allPostIds) { + uint32_t largestpossible = numeric_limits::max(); + uint32_t smallestpossible = 0; + pair answer = + make_pair(largestpossible, smallestpossible); + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + vector &onePost = getOnePost(id); + if (onePost.empty()) + continue; + if (onePost.front() < answer.first) + answer.first = onePost.front(); + if (onePost.back() > answer.second) + answer.second = onePost.back(); + } } - - /** - * Given a set of queries, this will find the size of the largest - * posting list corresponding to one of the IDs being queried. - * Before calling this function, the data should have be - * pre-loaded using the function loadOnePost or loadPostings; - * an exception is thrown otherwise. - */ - size_t findMaxPostingSize(const vector> &allPostIds) { - size_t MaxPostingSize(0); - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - vector &onePost = getOnePost(id); - if (MaxPostingSize < onePost.size()) MaxPostingSize = onePost.size(); - } - } - return MaxPostingSize; + if (answer.first > answer.second) { + cout << "[WARNING] invalid range because no data." << endl; + answer.second = answer.first; } - /* - * Before getOnePost are called, the data should be - * pre-loaded using the function loadOnePost or loadPostings; - * an exception is thrown otherwise. - */ - vector &getOnePost(uint32_t postId) { - if (readPostings.find(postId) == readPostings.end()) { - throw runtime_error("Should call loadIntegers before can access postings!"); - } - return readPostings[postId]; + return answer; + } + + /** + * Given a set of queries, this will find the size of the largest + * posting list corresponding to one of the IDs being queried. + * Before calling this function, the data should have be + * pre-loaded using the function loadOnePost or loadPostings; + * an exception is thrown otherwise. 
+ */ + size_t findMaxPostingSize(const vector> &allPostIds) { + size_t MaxPostingSize(0); + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + vector &onePost = getOnePost(id); + if (MaxPostingSize < onePost.size()) + MaxPostingSize = onePost.size(); + } } - - size_t getSizeInInts(uint32_t postId) { - return getOnePost(postId).size(); + return MaxPostingSize; + } + /* + * Before getOnePost are called, the data should be + * pre-loaded using the function loadOnePost or loadPostings; + * an exception is thrown otherwise. + */ + vector &getOnePost(uint32_t postId) { + if (readPostings.find(postId) == readPostings.end()) { + throw runtime_error( + "Should call loadIntegers before can access postings!"); } - - /** - * Flushes all posting lists, recovering the memory. - */ - void clear() { - readPostings.clear(); - memUsed = 0; + return readPostings[postId]; + } + + size_t getSizeInInts(uint32_t postId) { return getOnePost(postId).size(); } + + /** + * Flushes all posting lists, recovering the memory. + */ + void clear() { + readPostings.clear(); + memUsed = 0; + } + + /** + * Given a set of posting IDs, compute the corresponding intersection. + * This is *specifically* not meant to be fast. Do not use if you need + * good speed. (It *may* be fast... but it is not the design goal.) + * + * The data should be pre-loaded using the function loadOnePost or + * loadPostings; + * an exception is thrown otherwise. + */ + vector computeIntersection(const vector &qids, + intersectionfunction Inter) { + vector inter; + if (qids.empty()) + return inter; + vector> sizeids; + for (uint32_t i : qids) { + sizeids.emplace_back(make_pair(getOnePost(i).size(), i)); } - - /** - * Given a set of posting IDs, compute the corresponding intersection. - * This is *specifically* not meant to be fast. Do not use if you need - * good speed. (It *may* be fast... but it is not the design goal.) 
- * - * The data should be pre-loaded using the function loadOnePost or loadPostings; - * an exception is thrown otherwise. - */ - vector computeIntersection(const vector &qids, intersectionfunction Inter) { - vector inter; - if (qids.empty()) return inter; - vector> sizeids; - for (uint32_t i : qids) { - sizeids.emplace_back(make_pair(getOnePost(i).size(), i)); - } - sort(sizeids.begin(), sizeids.end()); - inter = getOnePost(sizeids.front().second); - size_t intersize = inter.size(); - for (size_t k = 1; k < qids.size() ; ++k) { - intersize = Inter(inter.data(), intersize, - getOnePost(sizeids[k].second).data(), getOnePost(sizeids[k].second).size(), inter.data()); - } - inter.resize(intersize); - return inter; + sort(sizeids.begin(), sizeids.end()); + inter = getOnePost(sizeids.front().second); + size_t intersize = inter.size(); + for (size_t k = 1; k < qids.size(); ++k) { + intersize = + Inter(inter.data(), intersize, getOnePost(sizeids[k].second).data(), + getOnePost(sizeids[k].second).size(), inter.data()); } - - size_t getMemUsed() const { return memUsed; } - - /** - * this function is called by the constructor. It recovers all - * of the location of the posting lists. So if you have 1000,000 posting lists, - * this could amount to sizeof(off_t) * 1000000 bytes. So maybe 8MB. - * - * Note that we don't normally keep all of the posting lists themselves in RAM. - */ - void readOffsets() { - seenOffsets.clear(); - off_t pos; - uint32_t qty; - while (true) { - if (!gapReader.readNextPosAndQty(pos, qty)) { - return; - } - seenOffsets.emplace_back(pos); - } + inter.resize(intersize); + return inter; + } + + size_t getMemUsed() const { return memUsed; } + + /** + * this function is called by the constructor. It recovers all + * of the location of the posting lists. So if you have 1000,000 posting lists, + * this could amount to sizeof(off_t) * 1000000 bytes. So maybe 8MB. + * + * Note that we don't normally keep all of the posting lists themselves in RAM. 
+ */ + void readOffsets() { + seenOffsets.clear(); + off_t pos; + uint32_t qty; + while (true) { + if (!gapReader.readNextPosAndQty(pos, qty)) { + return; + } + seenOffsets.emplace_back(pos); } - + } private: - // we make the copy constructor private to prevent the compiler from autogenerating one - BudgetedPostingCollector(const BudgetedPostingCollector &); - // we don't want people to use the assignment operator. - BudgetedPostingCollector &operator=(const BudgetedPostingCollector &); - - /* - * The class "knows" all offsets for posting ids from 0 to seenOffsets.size() - 1 - * Yet, some of the posting pointers can be null. For example, we start - * with an empty posting list collector and get a request to read posting with 10. - */ - vector seenOffsets; // location of the postings - unordered_map> readPostings; // accumulated postings - - MaropuGapReader gapReader; // reader to recover the postings - size_t memBudget; - size_t memUsed; + // we make the copy constructor private to prevent the compiler from + // autogenerating one + BudgetedPostingCollector(const BudgetedPostingCollector &); + // we don't want people to use the assignment operator. + BudgetedPostingCollector &operator=(const BudgetedPostingCollector &); + + /* + * The class "knows" all offsets for posting ids from 0 to seenOffsets.size() + * - 1 + * Yet, some of the posting pointers can be null. For example, we start + * with an empty posting list collector and get a request to read posting with + * 10. 
+ */ + vector seenOffsets; // location of the postings + unordered_map> readPostings; // accumulated postings + + MaropuGapReader gapReader; // reader to recover the postings + size_t memBudget; + size_t memUsed; }; - - - #endif /* SIMDCompressionAndIntersection_BUDGETEDPOSTINGCOLLECTOR_H_ */ diff --git a/advancedbenchmarking/include/maropuparser.h b/advancedbenchmarking/include/maropuparser.h index 7caba43..eeca62b 100644 --- a/advancedbenchmarking/include/maropuparser.h +++ b/advancedbenchmarking/include/maropuparser.h @@ -29,140 +29,137 @@ using namespace std; */ class MaropuGapReader { public: - MaropuGapReader(const string &filename) : - mFilename(filename), fd(NULL) { + MaropuGapReader(const string &filename) : mFilename(filename), fd(NULL) {} + + /** + * The copy constructor will assign the same file name, + * but the newly constructed object won't be opened. + */ + MaropuGapReader(const MaropuGapReader &mgr) + : mFilename(mgr.mFilename), fd(NULL) {} + + /** + * Assignment will close the current reader, and change + * the file name. You need to reopen the reader after the assignment. + */ + MaropuGapReader &operator=(const MaropuGapReader &mgr) { + close(); + mFilename = mgr.mFilename; + return *this; + } + + ~MaropuGapReader() { close(); } + + // @daniel: should we worry about our code being compilable on 32-bit + // machines? + // if so, we need to add -D_FILE_OFFSET_BITS=64 to the makefile + // Daniel: it would seem odd to consider 32-bit machines when we assume AVX + // support! + off_t getPos() { + errno = 0; + off_t res = ftello(fd); + if (res < 0) { + stringstream err; + err << "Error getting file position, IO status: " << strerror(errno); + throw runtime_error(err.str()); } - - - /** - * The copy constructor will assign the same file name, - * but the newly constructed object won't be opened. 
- */ - MaropuGapReader(const MaropuGapReader &mgr) : - mFilename(mgr.mFilename), fd(NULL) { - } - - /** - * Assignment will close the current reader, and change - * the file name. You need to reopen the reader after the assignment. - */ - MaropuGapReader &operator=(const MaropuGapReader &mgr) { - close(); - mFilename = mgr.mFilename; - return *this; + return res; + } + + void setPos(off_t pos) { + errno = 0; + off_t res = fseeko(fd, pos, SEEK_SET); + if (res < 0) { + stringstream err; + err << "Error setting file position, IO status: " << strerror(errno); + throw runtime_error(err.str()); } - - ~MaropuGapReader() { - close(); - } - - // @daniel: should we worry about our code being compilable on 32-bit machines? - // if so, we need to add -D_FILE_OFFSET_BITS=64 to the makefile - // Daniel: it would seem odd to consider 32-bit machines when we assume AVX support! - off_t getPos() { - errno = 0; - off_t res = ftello(fd); - if (res < 0) { - stringstream err; - err << "Error getting file position, IO status: " << strerror(errno); - throw runtime_error(err.str()); - } - return res; + } + + /* + * Return false if no more data can be loaded. + * Throw an exception in the case of IO error. 
+ */ + template bool loadIntegers(container &buffer) { + uint32_t qty = 0; + if (!ReadQty(qty)) + return false; // EOF + buffer.resize(qty); + errno = 0; + size_t result = fread(buffer.data(), sizeof(uint32_t), buffer.size(), fd); + if (result != buffer.size()) { + if (!errno) { + // If we can't read, the file maybe truncated, i.e., corrupt + throw runtime_error("The file appears to be truncated/corrupt!"); + } + stringstream err; + err << "Error reading from file, IO status: " << strerror(errno); + throw runtime_error(err.str()); } - - void setPos(off_t pos) { - errno = 0; - off_t res = fseeko(fd, pos, SEEK_SET); - if (res < 0) { - stringstream err; - err << "Error setting file position, IO status: " << strerror(errno); - throw runtime_error(err.str()); - } + return true; + } + + /* + * Return false if no more data can be loaded. + * Throw an exception in the case of IO error. + */ + bool readNextPosAndQty(off_t &pos, uint32_t &qty) { + pos = getPos(); + if (!ReadQty(qty)) + return false; // EOF + setPos(getPos() + qty * sizeof(uint32_t)); + return true; + } + + /** + * We must call open before we can use this class meaningfully. + */ + bool open() { + close(); + fd = ::fopen(mFilename.c_str(), "rb"); + if (fd == NULL) { + return false; } - - /* - * Return false if no more data can be loaded. - * Throw an exception in the case of IO error. 
- */ - template - bool loadIntegers(container &buffer) { - uint32_t qty = 0; - if (!ReadQty(qty)) return false; // EOF - buffer.resize(qty); - errno = 0; - size_t result = fread(buffer.data(), sizeof(uint32_t), buffer.size(), fd); - if (result != buffer.size()) { - if (!errno) { - // If we can't read, the file maybe truncated, i.e., corrupt - throw runtime_error("The file appears to be truncated/corrupt!"); - } - stringstream err; - err << "Error reading from file, IO status: " << strerror(errno); - throw runtime_error(err.str()); - } - return true; + setvbuf(fd, NULL, _IOFBF, 1024 * 4); // large buffer + return true; + } + + void close() { + if (fd != NULL) { + ::fclose(fd); + fd = NULL; } + } - /* - * Return false if no more data can be loaded. - * Throw an exception in the case of IO error. - */ - bool readNextPosAndQty(off_t &pos, uint32_t &qty) { - pos = getPos(); - if (!ReadQty(qty)) return false; // EOF - setPos(getPos() + qty * sizeof(uint32_t)); - return true; +private: + /* + * Returns false on EOF. + * Throws an exception in the case of IO error. + */ + bool ReadQty(uint32_t &qty) { + qty = 0; + if (fd == NULL) { + throw runtime_error("You forgot to open the file."); } - - /** - * We must call open before we can use this class meaningfully. - */ - bool open() { - close(); - fd = ::fopen(mFilename.c_str(), "rb"); - if (fd == NULL) { - return false; - } - setvbuf(fd , NULL , _IOFBF , 1024 * 4); // large buffer - return true; + errno = 0; + size_t result = fread(&qty, sizeof(qty), 1, fd); + if (errno) { + stringstream err; + err << "Error opening file, IO status: " << strerror(errno); + throw runtime_error(err.str()); } - - void close() { - if (fd != NULL) { - ::fclose(fd); - fd = NULL; - } + if (result != 1) { + return false; } -private: - /* - * Returns false on EOF. - * Throws an exception in the case of IO error. 
- */ - bool ReadQty(uint32_t &qty) { - qty = 0; - if (fd == NULL) { - throw runtime_error("You forgot to open the file."); - } - errno = 0; - size_t result = fread(&qty, sizeof(qty), 1, fd); - if (errno) { - stringstream err; - err << "Error opening file, IO status: " << strerror(errno); - throw runtime_error(err.str()); - } - if (result != 1) { - return false; - } - if (qty > 1 << 29) { - cout << "warning: reading a very large array (" << qty << " integers) : is your input file in the right format?" << - endl; - } - return true; + if (qty > 1 << 29) { + cout << "warning: reading a very large array (" << qty + << " integers) : is your input file in the right format?" << endl; } + return true; + } - string mFilename; - FILE *fd; - + string mFilename; + FILE *fd; }; #endif /* SIMDCompressionAndIntersection_MAROPUPARSER_H_ */ diff --git a/advancedbenchmarking/include/statisticsrecorder.h b/advancedbenchmarking/include/statisticsrecorder.h index e81bd66..db87011 100644 --- a/advancedbenchmarking/include/statisticsrecorder.h +++ b/advancedbenchmarking/include/statisticsrecorder.h @@ -11,82 +11,80 @@ #include "common.h" #include "timer.h" - class StatisticsRecorder { public: - StatisticsRecorder() : ss(), timer(), timesinmicros() { - ss << "# timeinms intersectionsize size1 size2 ..." 
<< endl; - timesinmicros.reserve(5000);//40KB buffer - } - - /** - * Call this before each new query - */ - inline void prepareQuery() { - timer.reset(); - } - - /** - * Call this before after each query - */ - template - inline void endQuery(size_t intersectionsize, SizeProvider &sizeprovider, - const vector &PostIds) { - uint64_t timeinmicroseconds = timer.split(); - timesinmicros.push_back(timeinmicroseconds); - ss << timeinmicroseconds << "\t"; - ss << intersectionsize << "\t"; - for (uint32_t id : PostIds) { - ss << sizeprovider.getSizeInInts(id) << "\t"; - } - ss << endl; - } - - // average time in ms per query - double averageTimeInMS() const { - if (timesinmicros.size() == 0) return 0; - return static_cast(std::accumulate(timesinmicros.begin(), timesinmicros.end(), 0)) * 0.001 / - static_cast(timesinmicros.size()); - } - - - // average time in ms per query - double medianTimeInMS() const { - if (timesinmicros.size() == 0) return 0; - vector buffer(timesinmicros); - sort(buffer.begin(), buffer.end()); - return static_cast(buffer[buffer.size() / 2]) * 0.001; // not *exactly* the median but close enough - } - - // average time in ms per query - double ninetypercentileTimeInMS() const { - if (timesinmicros.size() == 0) return 0; - vector buffer(timesinmicros); - sort(buffer.begin(), buffer.end()); - return static_cast(buffer[static_cast(round(buffer.size() * 0.9))]) * - 0.001; // not *exactly* the 90 perc. but close enough + StatisticsRecorder() : ss(), timer(), timesinmicros() { + ss << "# timeinms intersectionsize size1 size2 ..." 
<< endl; + timesinmicros.reserve(5000); // 40KB buffer + } + + /** + * Call this before each new query + */ + inline void prepareQuery() { timer.reset(); } + + /** + * Call this before after each query + */ + template + inline void endQuery(size_t intersectionsize, SizeProvider &sizeprovider, + const vector &PostIds) { + uint64_t timeinmicroseconds = timer.split(); + timesinmicros.push_back(timeinmicroseconds); + ss << timeinmicroseconds << "\t"; + ss << intersectionsize << "\t"; + for (uint32_t id : PostIds) { + ss << sizeprovider.getSizeInInts(id) << "\t"; } - - - /** - * Dump the CSV data - */ - void output(ostream &out) { - out << ss.str(); - } - - void reset() { - ss.str(""); - ss.seekp(0); - ss.seekg(0); - timesinmicros.clear(); - } - + ss << endl; + } + + // average time in ms per query + double averageTimeInMS() const { + if (timesinmicros.size() == 0) + return 0; + return static_cast( + std::accumulate(timesinmicros.begin(), timesinmicros.end(), 0)) * + 0.001 / static_cast(timesinmicros.size()); + } + + // average time in ms per query + double medianTimeInMS() const { + if (timesinmicros.size() == 0) + return 0; + vector buffer(timesinmicros); + sort(buffer.begin(), buffer.end()); + return static_cast(buffer[buffer.size() / 2]) * + 0.001; // not *exactly* the median but close enough + } + + // average time in ms per query + double ninetypercentileTimeInMS() const { + if (timesinmicros.size() == 0) + return 0; + vector buffer(timesinmicros); + sort(buffer.begin(), buffer.end()); + return static_cast( + buffer[static_cast(round(buffer.size() * 0.9))]) * + 0.001; // not *exactly* the 90 perc. 
but close enough + } + + /** + * Dump the CSV data + */ + void output(ostream &out) { out << ss.str(); } + + void reset() { + ss.str(""); + ss.seekp(0); + ss.seekg(0); + timesinmicros.clear(); + } private: - stringstream ss; - WallClockTimer timer; - vector timesinmicros; + stringstream ss; + WallClockTimer timer; + vector timesinmicros; }; #endif /* SIMDCompressionAndIntersection_STATISTICSRECORDER_H_ */ diff --git a/advancedbenchmarking/src/budgetedtest.cpp b/advancedbenchmarking/src/budgetedtest.cpp index 762308e..4e03cce 100644 --- a/advancedbenchmarking/src/budgetedtest.cpp +++ b/advancedbenchmarking/src/budgetedtest.cpp @@ -9,7 +9,6 @@ #include #include - #include "common.h" #include "util.h" #include "timer.h" @@ -23,34 +22,50 @@ using namespace SIMDCompressionLib; uint32_t FakeCheckSum(const uint32_t *p, size_t qty) { - return std::accumulate(p, p + qty, 0); + return std::accumulate(p, p + qty, 0); } void printusage(char *prog) { - cout << "Usage: " << prog << - " -b -s -l -i -o " - << endl; - cout << " -s specify compression scheme ( " ; - for (string s : CODECFactory::allNames()) cout << s << " "; - cout << ")" << endl; - cout << " (If no compression scheme is specified, we fall back on uncompressed postings.)" << endl; - cout << " -i specify intersection function ( " ; - for (string s : IntersectionFactory::allNames()) cout << s << " "; - cout << ")" << endl; - cout << " -o include one-word queries (default is do not include them)" << endl; - cout << " -q quiet mode, runs entire test before outputting results (default is verbose)" << endl; - cout << " -l allow you to specify a limit on the number of queries" << endl; - cout << " -p partition the postings during compression and decompression (param: number of partitions)" << endl; - cout << " -k to test Skipping (must provide a gap size param between 1 and 31)" << endl; - cout << " -B to activate bitmap mode (recommended values are in the range 8 to 32), when set to 0 in conjunction with compression, it 
means 'automatic' " - << endl; - cout << " -d to dump complete statistics " << endl; - cout << " -b memory budget in GB (recommended: less than half of your available memory) " << endl; + cout << "Usage: " << prog + << " -b " + " -s -l -i " + " -o " + << endl; + cout << " -s specify compression scheme ( "; + for (string s : CODECFactory::allNames()) + cout << s << " "; + cout << ")" << endl; + cout << " (If no compression scheme is specified, we fall back on " + "uncompressed postings.)" + << endl; + cout << " -i specify intersection function ( "; + for (string s : IntersectionFactory::allNames()) + cout << s << " "; + cout << ")" << endl; + cout + << " -o include one-word queries (default is do not include them)" + << endl; + cout << " -q quiet mode, runs entire test before outputting results " + "(default is verbose)" + << endl; + cout << " -l allow you to specify a limit on the number of queries" + << endl; + cout << " -p partition the postings during compression and " + "decompression (param: number of partitions)" + << endl; + cout << " -k to test Skipping (must provide a gap size param between " + "1 and 31)" + << endl; + cout << " -B to activate bitmap mode (recommended values are in the " + "range 8 to 32), when set to 0 in conjunction with compression, it " + "means 'automatic' " + << endl; + cout << " -d to dump complete statistics " << endl; + cout << " -b memory budget in GB (recommended: less than half of " + "your available memory) " + << endl; } - - - /** * This is a convenience class to run tests and recover statistics. 
* Basically, given a collection of queries, it will compress the relevant @@ -59,1231 +74,1339 @@ void printusage(char *prog) { */ class TestHelper { public: - TestHelper(IntegerCODEC &scheme, - intersectionfunction Inter) : - scheme(scheme), - CompressedSizeDuringPacking(0), - packVolume(0), unpackVolume(0), - packTime(0), unpackTime(0), interTime(0), - Inter(Inter), CoarsePackTime(0), - CoarseUnpackInterTime(0), + TestHelper(IntegerCODEC &scheme, intersectionfunction Inter) + : scheme(scheme), CompressedSizeDuringPacking(0), packVolume(0), + unpackVolume(0), packTime(0), unpackTime(0), interTime(0), Inter(Inter), + CoarsePackTime(0), CoarseUnpackInterTime(0), modifiesinput(CODECFactory::modifiesInputDuringCompression(scheme)), - MaxRecoveryBuffer(0), - SR() { - } - - /* - * The class reports cumulative results, but if you ever - * want to "zero" the statistics, you can call this method. - */ - void reset() { - CompressedSizeDuringPacking = 0; - packVolume = 0; - unpackVolume = 0; - packTime = 0; - unpackTime = 0; - interTime = 0; - CoarsePackTime = 0; - CoarseUnpackInterTime = 0; - MaxRecoveryBuffer = 0; - SR.reset(); + MaxRecoveryBuffer(0), SR() {} + + /* + * The class reports cumulative results, but if you ever + * want to "zero" the statistics, you can call this method. + */ + void reset() { + CompressedSizeDuringPacking = 0; + packVolume = 0; + unpackVolume = 0; + packTime = 0; + unpackTime = 0; + interTime = 0; + CoarsePackTime = 0; + CoarseUnpackInterTime = 0; + MaxRecoveryBuffer = 0; + SR.reset(); + } + + /** + * This is the main testing function. Posting lists are compressed, then + * uncompressed to RAM + * and intersected. 
+ */ + void test(BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + unordered_map> compPostings; // we use a hash + // table to store + // the postings, + // ought to fit in + // RAM + size_t MaxPostingSize(0); + + vector dirtyCopy; + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (compPostings.find(id) != + compPostings + .end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector &onePost = uncompPosts.getOnePost(id); + size_t qty = onePost.size(); + vector compressedBuffer( + qty); // this is potential a bit expensive: memory allocation + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + if (modifiesinput) // if true there is a copy which might be expensive + dirtyCopy = onePost; // some schemes might modify the input, hence it + // is necessary to make a copy in general + size_t nvalue = compressedBuffer.size(); + ////////////// + // BEGIN performance-sensitive section for *compression* + // (we don't care that much about it). 
+ ///////////// + z.reset(); + if (modifiesinput) + scheme.encodeArray(dirtyCopy.data(), dirtyCopy.size(), + compressedBuffer.data(), nvalue); + else + scheme.encodeArray(onePost.data(), onePost.size(), + compressedBuffer.data(), nvalue); + packTime += static_cast(z.split()); + ///////////// + // END performance-sensitive code for *compression* + ///////////// + assert(nvalue <= compressedBuffer.size()); + packVolume += qty; + CompressedSizeDuringPacking += nvalue; + compressedBuffer.resize(nvalue); + compressedBuffer.shrink_to_fit(); // this may or may not be useful + compPostings.emplace(id, compressedBuffer); // with some luck, + // compressedBuffer is + // *moved* to the container, + // not copied + } } + CoarsePackTime += static_cast(coarsez.split()); - + vector recoveryBuffer(MaxPostingSize + 1024); + if (MaxRecoveryBuffer < recoveryBuffer.size()) + MaxRecoveryBuffer = recoveryBuffer.size(); + vector intersection_result(recoveryBuffer.size()); + // pre2: verify results /** - * This is the main testing function. Posting lists are compressed, then uncompressed to RAM - * and intersected. + * We first test that the compressed version can be uncompressed back + * to the original (duh!) */ - void test(BudgetedPostingCollector &uncompPosts, const vector> &allPostIds) { - unordered_map> - compPostings; // we use a hash table to store the postings, ought to fit in RAM - size_t MaxPostingSize(0); - - vector dirtyCopy; - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. 
Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit - continue; // was compressed earlier - vector &onePost = uncompPosts.getOnePost(id); - size_t qty = onePost.size(); - vector compressedBuffer(qty); // this is potential a bit expensive: memory allocation - if (MaxPostingSize < qty) { - MaxPostingSize = qty; - } - if (modifiesinput) // if true there is a copy which might be expensive - dirtyCopy = onePost; // some schemes might modify the input, hence it is necessary to make a copy in general - size_t nvalue = compressedBuffer.size(); - ////////////// - // BEGIN performance-sensitive section for *compression* - // (we don't care that much about it). - ///////////// - z.reset(); - if (modifiesinput) - scheme.encodeArray(dirtyCopy.data(), dirtyCopy.size(), - compressedBuffer.data(), nvalue); - else - scheme.encodeArray(onePost.data(), onePost.size(), - compressedBuffer.data(), nvalue); - packTime += static_cast(z.split()); - ///////////// - // END performance-sensitive code for *compression* - ///////////// - assert(nvalue <= compressedBuffer.size()); - packVolume += qty; - CompressedSizeDuringPacking += nvalue; - compressedBuffer.resize(nvalue); - compressedBuffer.shrink_to_fit();// this may or may not be useful - compPostings.emplace(id, compressedBuffer);// with some luck, compressedBuffer is *moved* to the container, not copied - } - } - CoarsePackTime += static_cast(coarsez.split()); - - vector recoveryBuffer(MaxPostingSize + 1024); - if (MaxRecoveryBuffer < recoveryBuffer.size()) MaxRecoveryBuffer = recoveryBuffer.size(); - vector intersection_result(recoveryBuffer.size()); - // pre2: verify results - /** - * We first test that the compressed version can be uncompressed back - * to the original (duh!) 
- */ - for (unordered_map>::iterator i = compPostings.begin() ; i != compPostings.end(); ++i) { - const vector &compressed = i->second; - const vector &uncompressed = uncompPosts.getOnePost(i->first); - vector recbuffer(uncompressed.size() + 1024); - size_t recoveredsize = recbuffer.size(); - scheme.decodeArray(compressed.data(), compressed.size(), - recbuffer.data(), recoveredsize); - recbuffer.resize(recoveredsize); - if (recbuffer.size() != uncompressed.size()) { - cout << "Original array had size " << uncompressed.size() << endl; - cout << "Compressed size was " << compressed.size() << endl; - cout << "After decoding, the size was " << recoveredsize << endl; - throw runtime_error("Bug: don't even have same size!?!"); - } - if (recbuffer != uncompressed) { - cout << "size = " << uncompressed.size() << endl; - int display = 10; - for (size_t i = 0; i < uncompressed.size() ; ++i) { - if (uncompressed[i] != recbuffer[i]) { - cout << " i = " << i << " expected " << uncompressed[i] << " but got " << recbuffer[i] << endl; - display--; - if (display == 0) break; - } - } - - throw runtime_error("Bug: somehow, we can't recover the original."); - } - } - /** - * The next loop is just meant to test intersections - */ - for (const vector &qids : allPostIds) { - // we begin by recoving references to the posting lists. - vector *>> queryCompPost; - for (uint32_t id : qids) { - const vector &onePost = uncompPosts.getOnePost(id); - queryCompPost.emplace_back(make_pair(onePost.size(), & compPostings[id])); - } - assert(!queryCompPost.empty()); - // Sort in the order of increasing posting size - // Daniel: this is to make the SvS intersection faster... - sort(queryCompPost.begin(), queryCompPost.end()); - // the first posting list is a particular case, so we do it separately. 
- size_t intersectioncardinality = intersection_result.size(); - scheme.decodeArray(queryCompPost.front().second->data(), queryCompPost.front().second->size(), - intersection_result.data(), intersectioncardinality); - assert(intersectioncardinality <= intersection_result.size()); - for (size_t i = 1; (intersectioncardinality > 0) && (i < queryCompPost.size()); ++i) { - size_t recoveredsize = recoveryBuffer.size(); - scheme.decodeArray(queryCompPost[i].second->data(), queryCompPost[i].second->size(), - recoveryBuffer.data(), recoveredsize); - assert(recoveredsize <= recoveryBuffer.size()); - intersectioncardinality = Inter(intersection_result.data(), intersectioncardinality, - recoveryBuffer.data(), recoveredsize, intersection_result.data()); - } - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != intersectioncardinality) { - cout << "expected cardinality: " << trueintersection.size() << endl; - cout << "actual cardinality: " << intersectioncardinality << endl; - for (auto v : queryCompPost) - cout << v.first << endl; - throw runtime_error("Intersection bug with write-back: you don't even have the correct cardinality."); - } - if (!equal(trueintersection.begin(), trueintersection.end(), intersection_result.begin())) { - cout << "intersectioncardinality=" << intersectioncardinality << endl; - cout << "number of lists=" << qids.size() << endl; - for (auto v : queryCompPost) - cout << v.first << endl; - throw runtime_error("you have a bug with intersections using write-backs!"); - } - } - /** - * End of testing - */ - // if these tests passed, then we should have the right answer. - coarsez.reset(); - // 2. Test full cycle (decompression + intersection) - for (const vector &qids : allPostIds) { - SR.prepareQuery(); - // we begin by recoving references to the posting lists. 
- vector *>> queryCompPost; - for (uint32_t id : qids) { - const vector &onePost = uncompPosts.getOnePost(id); - queryCompPost.emplace_back(make_pair(onePost.size(), & compPostings[id])); - } - assert(!queryCompPost.empty()); - // Sort in the order of increasing posting size - // Daniel: this is to make the SvS intersection faster... - // we don't time this operation - sort(queryCompPost.begin(), queryCompPost.end()); - // the first posting list is a particular case, so we do it separately. - - // we use pointers explicitely to me it easier for non-STL folks - const uint32_t *input = queryCompPost.front().second->data(); - size_t inputsize = queryCompPost.front().second->size(); - uint32_t *const intersectionbuffer = intersection_result.data(); - - ////////////////////// - // BEGIN performance-sensitive section - // Note that we possibly uncompress to RAM, and not to cache. - // Moreover, input might not be in cache. - ////////////////////// - z.reset(); - size_t intersectioncardinality = intersection_result.size(); - scheme.decodeArray(input, inputsize , - intersectionbuffer, intersectioncardinality); - unpackTime += static_cast(z.split()); - ///////////////////// - // END performance-sensitive section - ///////////////////// - assert(intersectioncardinality <= intersection_result.size()); - unpackVolume += intersectioncardinality; - uint32_t *const recoverybuffer = recoveryBuffer.data(); - for (size_t i = 1; (intersectioncardinality > 0) && (i < queryCompPost.size()); ++i) { - size_t recoveredsize = recoveryBuffer.size(); - // again we use explicit pointers to make it easier for non-STL people - input = queryCompPost[i].second->data(); - inputsize = queryCompPost[i].second->size(); - ///////////////////////// - // BEGIN performance-sensitive section - // Note that input might not be in cache, and that - // output might be to RAM - //////////////////////// - z.reset(); - scheme.decodeArray(input, inputsize, recoverybuffer , recoveredsize); - unpackTime += 
static_cast(z.split()); - ///////////////////////// - // END performance-sensitive section - ///////////////////////// - assert(recoveredsize <= recoveryBuffer.size()); - unpackVolume += recoveredsize; - ///////////////////////// - // BEGIN performance-sensitive section for intersections. - // Both inputs could be in RAM, not in cache. - //////////////////////// - z.reset(); - intersectioncardinality = Inter(intersectionbuffer, intersectioncardinality, - recoverybuffer, recoveredsize, intersectionbuffer); - interTime += static_cast(z.split()); - //////////////////////////////// - // END performance-sensitive section for intersections. - /////////////////////////////// - } - SR.endQuery(intersectioncardinality, uncompPosts, qids) ; + for (unordered_map>::iterator i = + compPostings.begin(); + i != compPostings.end(); ++i) { + const vector &compressed = i->second; + const vector &uncompressed = uncompPosts.getOnePost(i->first); + vector recbuffer(uncompressed.size() + 1024); + size_t recoveredsize = recbuffer.size(); + scheme.decodeArray(compressed.data(), compressed.size(), recbuffer.data(), + recoveredsize); + recbuffer.resize(recoveredsize); + if (recbuffer.size() != uncompressed.size()) { + cout << "Original array had size " << uncompressed.size() << endl; + cout << "Compressed size was " << compressed.size() << endl; + cout << "After decoding, the size was " << recoveredsize << endl; + throw runtime_error("Bug: don't even have same size!?!"); + } + if (recbuffer != uncompressed) { + cout << "size = " << uncompressed.size() << endl; + int display = 10; + for (size_t i = 0; i < uncompressed.size(); ++i) { + if (uncompressed[i] != recbuffer[i]) { + cout << " i = " << i << " expected " << uncompressed[i] + << " but got " << recbuffer[i] << endl; + display--; + if (display == 0) + break; + } } - CoarseUnpackInterTime += static_cast(coarsez.split()); - // Prevent from optimizing out - cout << "### Ignore: " - << (FakeCheckSum(recoveryBuffer.data(), 
recoveryBuffer.size()) + - FakeCheckSum(intersection_result.data(), intersection_result.size())) << endl; + throw runtime_error("Bug: somehow, we can't recover the original."); + } + } + /** + * The next loop is just meant to test intersections + */ + for (const vector &qids : allPostIds) { + // we begin by recoving references to the posting lists. + vector *>> queryCompPost; + for (uint32_t id : qids) { + const vector &onePost = uncompPosts.getOnePost(id); + queryCompPost.emplace_back( + make_pair(onePost.size(), &compPostings[id])); + } + assert(!queryCompPost.empty()); + // Sort in the order of increasing posting size + // Daniel: this is to make the SvS intersection faster... + sort(queryCompPost.begin(), queryCompPost.end()); + // the first posting list is a particular case, so we do it separately. + size_t intersectioncardinality = intersection_result.size(); + scheme.decodeArray(queryCompPost.front().second->data(), + queryCompPost.front().second->size(), + intersection_result.data(), intersectioncardinality); + assert(intersectioncardinality <= intersection_result.size()); + for (size_t i = 1; + (intersectioncardinality > 0) && (i < queryCompPost.size()); ++i) { + size_t recoveredsize = recoveryBuffer.size(); + scheme.decodeArray(queryCompPost[i].second->data(), + queryCompPost[i].second->size(), + recoveryBuffer.data(), recoveredsize); + assert(recoveredsize <= recoveryBuffer.size()); + intersectioncardinality = Inter( + intersection_result.data(), intersectioncardinality, + recoveryBuffer.data(), recoveredsize, intersection_result.data()); + } + vector trueintersection = + uncompPosts.computeIntersection(qids, onesidedgallopingintersection); + if (trueintersection.size() != intersectioncardinality) { + cout << "expected cardinality: " << trueintersection.size() << endl; + cout << "actual cardinality: " << intersectioncardinality << endl; + for (auto v : queryCompPost) + cout << v.first << endl; + throw runtime_error("Intersection bug with write-back: you 
don't even " + "have the correct cardinality."); + } + if (!equal(trueintersection.begin(), trueintersection.end(), + intersection_result.begin())) { + cout << "intersectioncardinality=" << intersectioncardinality << endl; + cout << "number of lists=" << qids.size() << endl; + for (auto v : queryCompPost) + cout << v.first << endl; + throw runtime_error( + "you have a bug with intersections using write-backs!"); + } } - - - - /** - * This is a version of "tests" where compressed posting lists are divided it up in NumberOfPartitions partitions. - * If NumberOfPartitions is large enough, uncompressed posting lists will reside in cache, not in RAM. + * End of testing */ + // if these tests passed, then we should have the right answer. + coarsez.reset(); + // 2. Test full cycle (decompression + intersection) + for (const vector &qids : allPostIds) { + SR.prepareQuery(); + // we begin by recoving references to the posting lists. + vector *>> queryCompPost; + for (uint32_t id : qids) { + const vector &onePost = uncompPosts.getOnePost(id); + queryCompPost.emplace_back( + make_pair(onePost.size(), &compPostings[id])); + } + assert(!queryCompPost.empty()); + // Sort in the order of increasing posting size + // Daniel: this is to make the SvS intersection faster... + // we don't time this operation + sort(queryCompPost.begin(), queryCompPost.end()); + // the first posting list is a particular case, so we do it separately. + + // we use pointers explicitely to me it easier for non-STL folks + const uint32_t *input = queryCompPost.front().second->data(); + size_t inputsize = queryCompPost.front().second->size(); + uint32_t *const intersectionbuffer = intersection_result.data(); + + ////////////////////// + // BEGIN performance-sensitive section + // Note that we possibly uncompress to RAM, and not to cache. + // Moreover, input might not be in cache. 
+ ////////////////////// + z.reset(); + size_t intersectioncardinality = intersection_result.size(); + scheme.decodeArray(input, inputsize, intersectionbuffer, + intersectioncardinality); + unpackTime += static_cast(z.split()); + ///////////////////// + // END performance-sensitive section + ///////////////////// + assert(intersectioncardinality <= intersection_result.size()); + unpackVolume += intersectioncardinality; + uint32_t *const recoverybuffer = recoveryBuffer.data(); + for (size_t i = 1; + (intersectioncardinality > 0) && (i < queryCompPost.size()); ++i) { + size_t recoveredsize = recoveryBuffer.size(); + // again we use explicit pointers to make it easier for non-STL people + input = queryCompPost[i].second->data(); + inputsize = queryCompPost[i].second->size(); + ///////////////////////// + // BEGIN performance-sensitive section + // Note that input might not be in cache, and that + // output might be to RAM + //////////////////////// + z.reset(); + scheme.decodeArray(input, inputsize, recoverybuffer, recoveredsize); + unpackTime += static_cast(z.split()); + ///////////////////////// + // END performance-sensitive section + ///////////////////////// + assert(recoveredsize <= recoveryBuffer.size()); + unpackVolume += recoveredsize; + ///////////////////////// + // BEGIN performance-sensitive section for intersections. + // Both inputs could be in RAM, not in cache. + //////////////////////// + z.reset(); + intersectioncardinality = + Inter(intersectionbuffer, intersectioncardinality, recoverybuffer, + recoveredsize, intersectionbuffer); + interTime += static_cast(z.split()); + //////////////////////////////// + // END performance-sensitive section for intersections. 
+ /////////////////////////////// + } + SR.endQuery(intersectioncardinality, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); - void splittedtest(int NumberOfPartitions, - BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - // we index on posting ID, and point to a vector of compressed postings - unordered_map>> compPostings; - unordered_map> uncompsizes; - - pair range = uncompPosts.findDocumentIDRange( - allPostIds); - vector bounds(NumberOfPartitions + 1); - for (int part = 0; part < NumberOfPartitions; ++part) - bounds[part] = range.first + part * (range.second - range.first) - / NumberOfPartitions;// slightly uneven - bounds[NumberOfPartitions] = range.second + 1; - vector maxSize(NumberOfPartitions);// will contain the max size of a partition of a posting list - size_t TotalMaxPostingSize = 0; - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - vector &onePost = uncompPosts.getOnePost(id); - TotalMaxPostingSize = max(TotalMaxPostingSize, onePost.size()); - vector::iterator i = onePost.begin(); - for (int part = 0; part < NumberOfPartitions; ++part) { - vector::iterator j = lower_bound(i, onePost.end(), bounds[part + 1]); - uint32_t thissize = static_cast(j - i); - if (thissize > maxSize[part]) maxSize[part] = thissize; - i = j; - } - } - } - uint32_t AbsoluteMaxSize = *max_element(maxSize.begin(), maxSize.end()); - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit - continue; // was compressed earlier - vector &onePost = uncompPosts.getOnePost(id); - ////////////// - // BEGIN performance-sensitive section for *compression* - // (we don't care that much about it). 
- ///////////// - z.reset(); - - vector::iterator i = onePost.begin(); - vector> subposts(NumberOfPartitions); - vector subsizes(NumberOfPartitions); - uint32_t sanitycheck = 0; - for (int part = 0; (i != onePost.end()) && (part < NumberOfPartitions); ++part) { - vector compressedBuffer; - vector::iterator j = lower_bound(i, onePost.end(), bounds[part + 1]); - uint32_t thissize = static_cast(j - i); - if (j != onePost.end()) assert(*j >= bounds[part + 1]); - assert(*i >= bounds[part]); - subsizes[part] = thissize; - sanitycheck += thissize; - if (thissize != 0) { - compressedBuffer.resize(thissize + 128); - size_t nvalue = compressedBuffer.size(); - vector dirtyCopy(i, j); // we make a copy because (1) some schemes modify input (2) we need 128-bit alignment - scheme.encodeArray(dirtyCopy.data(), thissize, - compressedBuffer.data(), nvalue); - compressedBuffer.resize(nvalue); - packVolume += thissize; - CompressedSizeDuringPacking += nvalue; - } - subposts[part] = compressedBuffer; - i = j; - - } - assert(i == onePost.end()); - assert(sanitycheck == onePost.size()); - compPostings.emplace(id, subposts); - uncompsizes.emplace(id, subsizes); - packTime += static_cast(z.split()); - } - } - CoarsePackTime += static_cast(coarsez.split()); - vector recoveryBuffer(AbsoluteMaxSize); - if (MaxRecoveryBuffer < AbsoluteMaxSize) MaxRecoveryBuffer = AbsoluteMaxSize; - vector total_intersection_result(TotalMaxPostingSize + 1024); - vector > intersection_result; - for (uint32_t size : maxSize) { - vector intersectionbuffer(size + 128); - intersection_result.emplace_back(intersectionbuffer); - } + // Prevent from optimizing out + cout << "### Ignore: " + << (FakeCheckSum(recoveryBuffer.data(), recoveryBuffer.size()) + + FakeCheckSum(intersection_result.data(), + intersection_result.size())) + << endl; + } + /** + * This is a version of "tests" where compressed posting lists are divided it + * up in NumberOfPartitions partitions. 
+ * If NumberOfPartitions is large enough, uncompressed posting lists will + * reside in cache, not in RAM. + */ - // 2. Test full cycle (decompression + intersection) - // if 0 == phase, we test the correctness of the algorithm. - // if 1 == phase, we test performance - for (int phase = 0; phase < 2; ++ phase) { - if (1 == phase) coarsez.reset(); - - for (const vector &qids : allPostIds) { - if (1 == phase) SR.prepareQuery(); - size_t totalintercardinality = 0; - for (int part = 0; part < NumberOfPartitions; ++part) { - vector *>> queryCompPost; - for (uint32_t id : qids) { - uint32_t myuncompsize = uncompsizes[id][part]; - const vector *compressedposting = & compPostings[id][part]; - queryCompPost.emplace_back(make_pair(myuncompsize, compressedposting)); - } - sort(queryCompPost.begin(), queryCompPost.end()); - const uint32_t *input = queryCompPost.front().second->data(); - size_t inputsize = queryCompPost.front().second->size(); - uint32_t *const intersectionbuffer = intersection_result[part].data(); - if (1 == phase) z.reset(); - size_t intersectioncardinality = intersection_result[part].size(); - if (inputsize == 0) - intersectioncardinality = 0; - else - scheme.decodeArray(input, inputsize , - intersectionbuffer, intersectioncardinality); - if (1 == phase) { - unpackTime += static_cast(z.split()); - unpackVolume += intersectioncardinality; - } - assert(intersectioncardinality <= intersection_result[part].size()); - uint32_t *const recoverybuffer = recoveryBuffer.data(); - for (size_t i = 1; (intersectioncardinality > 0) && (i < queryCompPost.size()); ++i) { - // again we use explicit pointers to make it easier for non-STL people - size_t recoveredsize = recoveryBuffer.size(); - input = queryCompPost[i].second->data(); - inputsize = queryCompPost[i].second->size(); - ///////////////////////// - // BEGIN performance-sensitive section - // Note that input might not be in cache, and that - // output might be to RAM - //////////////////////// - if (1 == phase) 
z.reset(); - scheme.decodeArray(input, inputsize, recoverybuffer , recoveredsize); - if (1 == phase) { - unpackTime += static_cast(z.split()); - unpackVolume += recoveredsize; - } - assert(recoveredsize <= recoveryBuffer.size()); - ///////////////////////// - // END performance-sensitive section - ///////////////////////// - ///////////////////////// - // BEGIN performance-sensitive section for intersections. - // Both inputs could be in RAM, not in cache. - //////////////////////// - if (1 == phase) z.reset(); - intersectioncardinality = Inter(intersectionbuffer, intersectioncardinality, - recoverybuffer, recoveredsize, intersectionbuffer); - if (1 == phase) interTime += static_cast(z.split()); - //////////////////////////////// - // END performance-sensitive section for intersections. - /////////////////////////////// - } - // Saving the result - copy(intersectionbuffer, - intersectionbuffer + intersectioncardinality, - total_intersection_result.begin() + totalintercardinality); - totalintercardinality += intersectioncardinality; - } - if (1 == phase) SR.endQuery(totalintercardinality, uncompPosts, qids); - if (0 == phase) { - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != totalintercardinality) { - stringstream err; - err << "Different cardinality, expected: " - << trueintersection.size() - << " got: " << totalintercardinality; - throw runtime_error(err.str()); - } - for (uint32_t k = 0; k < trueintersection.size(); ++k) { - if (trueintersection[k] != total_intersection_result[k]) { - throw runtime_error("intersection bug"); - } - } - } - } - if (1 == phase) CoarseUnpackInterTime += static_cast(coarsez.split()); + void splittedtest(int NumberOfPartitions, + BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + // we index on posting ID, and point to a vector of compressed postings + unordered_map>> compPostings; + unordered_map> uncompsizes; + + pair range = + 
uncompPosts.findDocumentIDRange(allPostIds); + vector bounds(NumberOfPartitions + 1); + for (int part = 0; part < NumberOfPartitions; ++part) + bounds[part] = range.first + + part * (range.second - range.first) / + NumberOfPartitions; // slightly uneven + bounds[NumberOfPartitions] = range.second + 1; + vector maxSize(NumberOfPartitions); // will contain the max size + // of a partition of a posting + // list + size_t TotalMaxPostingSize = 0; + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + vector &onePost = uncompPosts.getOnePost(id); + TotalMaxPostingSize = max(TotalMaxPostingSize, onePost.size()); + vector::iterator i = onePost.begin(); + for (int part = 0; part < NumberOfPartitions; ++part) { + vector::iterator j = + lower_bound(i, onePost.end(), bounds[part + 1]); + uint32_t thissize = static_cast(j - i); + if (thissize > maxSize[part]) + maxSize[part] = thissize; + i = j; } - - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(total_intersection_result.data(), total_intersection_result.size()) << endl; + } } - - void skippingtest(int SkipLog, BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - unordered_map> - compPostings; // we use a hash table to store the postings, ought to fit in RAM - size_t MaxPostingSize(0); - - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. 
Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit - continue; // was compressed earlier - vector &onePost = uncompPosts.getOnePost(id); - z.reset(); - compPostings.emplace(id , shared_ptr(new Skipping(SkipLog, onePost.data(), onePost.size()))); - packTime += static_cast(z.split()); - size_t qty = onePost.size(); - if (MaxPostingSize < qty) { - MaxPostingSize = qty; - } - packVolume += qty; - CompressedSizeDuringPacking += (compPostings[id]->storageInBytes() + sizeof(uint32_t) - 1) / sizeof(uint32_t); - } + uint32_t AbsoluteMaxSize = *max_element(maxSize.begin(), maxSize.end()); + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (compPostings.find(id) != + compPostings + .end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector &onePost = uncompPosts.getOnePost(id); + ////////////// + // BEGIN performance-sensitive section for *compression* + // (we don't care that much about it). 
+ ///////////// + z.reset(); + + vector::iterator i = onePost.begin(); + vector> subposts(NumberOfPartitions); + vector subsizes(NumberOfPartitions); + uint32_t sanitycheck = 0; + for (int part = 0; (i != onePost.end()) && (part < NumberOfPartitions); + ++part) { + vector compressedBuffer; + vector::iterator j = + lower_bound(i, onePost.end(), bounds[part + 1]); + uint32_t thissize = static_cast(j - i); + if (j != onePost.end()) + assert(*j >= bounds[part + 1]); + assert(*i >= bounds[part]); + subsizes[part] = thissize; + sanitycheck += thissize; + if (thissize != 0) { + compressedBuffer.resize(thissize + 128); + size_t nvalue = compressedBuffer.size(); + vector dirtyCopy(i, j); // we make a copy because (1) some + // schemes modify input (2) we + // need 128-bit alignment + scheme.encodeArray(dirtyCopy.data(), thissize, + compressedBuffer.data(), nvalue); + compressedBuffer.resize(nvalue); + packVolume += thissize; + CompressedSizeDuringPacking += nvalue; + } + subposts[part] = compressedBuffer; + i = j; } - CoarsePackTime += static_cast(coarsez.split()); - vector recoveryBuffer(MaxPostingSize + 1024); - if (MaxRecoveryBuffer < recoveryBuffer.size()) - MaxRecoveryBuffer = recoveryBuffer.size(); - vector intersection_result(recoveryBuffer.size()); + assert(i == onePost.end()); + assert(sanitycheck == onePost.size()); + compPostings.emplace(id, subposts); + uncompsizes.emplace(id, subsizes); + packTime += static_cast(z.split()); + } + } + CoarsePackTime += static_cast(coarsez.split()); + vector recoveryBuffer(AbsoluteMaxSize); + if (MaxRecoveryBuffer < AbsoluteMaxSize) + MaxRecoveryBuffer = AbsoluteMaxSize; + vector total_intersection_result(TotalMaxPostingSize + 1024); + vector> intersection_result; + for (uint32_t size : maxSize) { + vector intersectionbuffer(size + 128); + intersection_result.emplace_back(intersectionbuffer); + } + + // 2. Test full cycle (decompression + intersection) + // if 0 == phase, we test the correctness of the algorithm. 
+ // if 1 == phase, we test performance + for (int phase = 0; phase < 2; ++phase) { + if (1 == phase) coarsez.reset(); - for (vector qids : allPostIds) { - if (qids.empty()) continue; // odd but could happen? - SR.prepareQuery(); - vector> sizeids; - for (uint32_t i : qids) { - sizeids.emplace_back(make_pair(compPostings[i]->Length, i)); + + for (const vector &qids : allPostIds) { + if (1 == phase) + SR.prepareQuery(); + size_t totalintercardinality = 0; + for (int part = 0; part < NumberOfPartitions; ++part) { + vector *>> queryCompPost; + for (uint32_t id : qids) { + uint32_t myuncompsize = uncompsizes[id][part]; + const vector *compressedposting = &compPostings[id][part]; + queryCompPost.emplace_back( + make_pair(myuncompsize, compressedposting)); + } + sort(queryCompPost.begin(), queryCompPost.end()); + const uint32_t *input = queryCompPost.front().second->data(); + size_t inputsize = queryCompPost.front().second->size(); + uint32_t *const intersectionbuffer = intersection_result[part].data(); + if (1 == phase) + z.reset(); + size_t intersectioncardinality = intersection_result[part].size(); + if (inputsize == 0) + intersectioncardinality = 0; + else + scheme.decodeArray(input, inputsize, intersectionbuffer, + intersectioncardinality); + if (1 == phase) { + unpackTime += static_cast(z.split()); + unpackVolume += intersectioncardinality; + } + assert(intersectioncardinality <= intersection_result[part].size()); + uint32_t *const recoverybuffer = recoveryBuffer.data(); + for (size_t i = 1; + (intersectioncardinality > 0) && (i < queryCompPost.size()); + ++i) { + // again we use explicit pointers to make it easier for non-STL + // people + size_t recoveredsize = recoveryBuffer.size(); + input = queryCompPost[i].second->data(); + inputsize = queryCompPost[i].second->size(); + ///////////////////////// + // BEGIN performance-sensitive section + // Note that input might not be in cache, and that + // output might be to RAM + //////////////////////// + if (1 == phase) 
+ z.reset(); + scheme.decodeArray(input, inputsize, recoverybuffer, recoveredsize); + if (1 == phase) { + unpackTime += static_cast(z.split()); + unpackVolume += recoveredsize; } - sort(sizeids.begin(), sizeids.end()); - size_t intersize; - if (sizeids.size() == 1) { - intersize = compPostings[sizeids.front().second]->decompress(intersection_result.data()); - unpackVolume += intersize; - } else { - assert(compPostings.size() >= 2); - intersize = compPostings[sizeids[0].second]->intersect(*compPostings[sizeids[1].second], intersection_result.data()); - unpackVolume += compPostings[sizeids[0].second]->Length; - unpackVolume += compPostings[sizeids[1].second]->Length; - for (size_t k = 2; (intersize > 0) && (k < sizeids.size()); ++k) { - unpackVolume += compPostings[sizeids[k].second]->Length; - intersize = compPostings[sizeids[k].second]->intersect(intersection_result.data(), intersize, - intersection_result.data()); - } + assert(recoveredsize <= recoveryBuffer.size()); + ///////////////////////// + // END performance-sensitive section + ///////////////////////// + ///////////////////////// + // BEGIN performance-sensitive section for intersections. + // Both inputs could be in RAM, not in cache. + //////////////////////// + if (1 == phase) + z.reset(); + intersectioncardinality = + Inter(intersectionbuffer, intersectioncardinality, + recoverybuffer, recoveredsize, intersectionbuffer); + if (1 == phase) + interTime += static_cast(z.split()); + //////////////////////////////// + // END performance-sensitive section for intersections. 
+ /////////////////////////////// + } + // Saving the result + copy(intersectionbuffer, intersectionbuffer + intersectioncardinality, + total_intersection_result.begin() + totalintercardinality); + totalintercardinality += intersectioncardinality; + } + if (1 == phase) + SR.endQuery(totalintercardinality, uncompPosts, qids); + if (0 == phase) { + vector trueintersection = uncompPosts.computeIntersection( + qids, onesidedgallopingintersection); + if (trueintersection.size() != totalintercardinality) { + stringstream err; + err << "Different cardinality, expected: " + << trueintersection.size() << " got: " << totalintercardinality; + throw runtime_error(err.str()); + } + for (uint32_t k = 0; k < trueintersection.size(); ++k) { + if (trueintersection[k] != total_intersection_result[k]) { + throw runtime_error("intersection bug"); } - SR.endQuery(intersize, uncompPosts, qids); + } } + } + if (1 == phase) CoarseUnpackInterTime += static_cast(coarsez.split()); - - // Prevent from optimizing out - cout << "### Ignore: " - << (FakeCheckSum(recoveryBuffer.data(), recoveryBuffer.size()) + - FakeCheckSum(intersection_result.data(), intersection_result.size())) << endl; } - /* - * If NumberOfPartitions == 0, we call a well-tested function with no partitions - */ - void bitmaptest(int NumberOfPartitions, - uint32_t th, BudgetedPostingCollector &uncompPosts, + // Prevent from optimizing out + cout << "### Ignore: " << FakeCheckSum(total_intersection_result.data(), + total_intersection_result.size()) + << endl; + } + + void skippingtest(int SkipLog, BudgetedPostingCollector &uncompPosts, const vector> &allPostIds) { - if (NumberOfPartitions > 1) { - bitmaptestsplit(NumberOfPartitions, th, uncompPosts, allPostIds); - } else { - bitmaptestnosplit(th, uncompPosts, allPostIds); + unordered_map> compPostings; // we use a hash + // table to + // store the + // postings, + // ought to fit + // in RAM + size_t MaxPostingSize(0); + + WallClockTimer z; // this is use only to time what we 
care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (compPostings.find(id) != + compPostings + .end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector &onePost = uncompPosts.getOnePost(id); + z.reset(); + compPostings.emplace(id, shared_ptr(new Skipping( + SkipLog, onePost.data(), onePost.size()))); + packTime += static_cast(z.split()); + size_t qty = onePost.size(); + if (MaxPostingSize < qty) { + MaxPostingSize = qty; } + packVolume += qty; + CompressedSizeDuringPacking += + (compPostings[id]->storageInBytes() + sizeof(uint32_t) - 1) / + sizeof(uint32_t); + } } - - void bitmaptestsplit(int NumberOfPartitions, - uint32_t th, BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - pair range = uncompPosts.findDocumentIDRange(allPostIds); - vector recovbufferHybrid; - vector> hybridPart(NumberOfPartitions); - vector MaxRecoveryBufferPart(NumberOfPartitions + 1); - - vector bounds(NumberOfPartitions + 1); - - for (int part = 0; part < NumberOfPartitions; ++part) { - bounds[part] = range.first + part * (range.second - range.first) - / NumberOfPartitions;// slightly uneven + CoarsePackTime += static_cast(coarsez.split()); + vector recoveryBuffer(MaxPostingSize + 1024); + if (MaxRecoveryBuffer < recoveryBuffer.size()) + MaxRecoveryBuffer = recoveryBuffer.size(); + vector intersection_result(recoveryBuffer.size()); + coarsez.reset(); + for (vector qids : allPostIds) { + if (qids.empty()) + continue; // odd but could happen? 
+ SR.prepareQuery(); + vector> sizeids; + for (uint32_t i : qids) { + sizeids.emplace_back(make_pair(compPostings[i]->Length, i)); + } + sort(sizeids.begin(), sizeids.end()); + size_t intersize; + if (sizeids.size() == 1) { + intersize = compPostings[sizeids.front().second]->decompress( + intersection_result.data()); + unpackVolume += intersize; + } else { + assert(compPostings.size() >= 2); + intersize = compPostings[sizeids[0].second]->intersect( + *compPostings[sizeids[1].second], intersection_result.data()); + unpackVolume += compPostings[sizeids[0].second]->Length; + unpackVolume += compPostings[sizeids[1].second]->Length; + for (size_t k = 2; (intersize > 0) && (k < sizeids.size()); ++k) { + unpackVolume += compPostings[sizeids[k].second]->Length; + intersize = compPostings[sizeids[k].second]->intersect( + intersection_result.data(), intersize, + intersection_result.data()); } - bounds[NumberOfPartitions] = range.second + 1; - - for (int i = 0; i < NumberOfPartitions; ++i) { - hybridPart[i] = unique_ptr(new HybM2(scheme, Inter, - bounds[i + 1] - bounds[i], - recovbufferHybrid, th - )); - } - - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - - size_t TotalMaxPostingSize = 0; - size_t MaxPostingSizePart = 0; - - // 1. 
Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - vector &onePost = uncompPosts.getOnePost(id); - - - if (TotalMaxPostingSize < onePost.size()) { - TotalMaxPostingSize = onePost.size(); - } - - vector::iterator i = onePost.begin(); - uint32_t sanitycheck = 0; - for (int part = 0; (part < NumberOfPartitions); ++part) { - if (i == onePost.end()) { - /* - * We need to load the empty posting or the intersection - * will fail - */ - if (!hybridPart[part]->hasBeenLoaded(id)) { // wasn't compressed earlier - static vector emptyPost; // thread-safe in C++ 11 - - z.reset(); // actually this should be very quick - CompressedSizeDuringPacking += hybridPart[part]-> - load(id, emptyPost.data(), emptyPost.size()); - packTime += static_cast(z.split()); - - packVolume += 0; - } - continue; - } - vector::iterator j = lower_bound(i, onePost.end(), bounds[part + 1]); - uint32_t thissize = static_cast(j - i); - if (MaxPostingSizePart < thissize) { - MaxPostingSizePart = thissize; - } - if (j != onePost.end()) assert(*j >= bounds[part + 1]); - assert(*i >= bounds[part]); - sanitycheck += thissize; - - if (!hybridPart[part]->hasBeenLoaded(id)) { // wasn't compressed earlier - ////////////// - // BEGIN performance-sensitive section for *compression* - // (we don't care that much about it). - ///////////// - - z.reset(); - /* - * We make a copy because - * (1) we need to subtract bounds[part] - * (2) some schemes modify input - * (3) we need 128-bit alignment - */ - vector dirtyCopy(i, j); - - /* - * This might be a bit suboptimal. - * However Leo expects that GCC 4.7 - * would vectorize this operation. - * Anyways, we don't care much - * about compression speeds. 
- */ - - size_t minId = bounds[part]; - size_t qty = dirtyCopy.size(); - - for (size_t idx = 0; idx < qty; ++idx) { - dirtyCopy[idx] -= minId; - } - - CompressedSizeDuringPacking += hybridPart[part]-> - load(id, dirtyCopy.data(), dirtyCopy.size()); - packTime += static_cast(z.split()); - - packVolume += thissize; - ////////////// - // END performance-sensitive section for *compression* - ////////////// - } - i = j; - - if (MaxRecoveryBufferPart[part] < hybridPart[part]->sizeOfRecoveryBufferInWords()) - MaxRecoveryBufferPart[part] = hybridPart[part]->sizeOfRecoveryBufferInWords(); - } - assert(i == onePost.end()); - assert(sanitycheck == onePost.size()); - packTime += static_cast(z.split()); - } - } - CoarsePackTime += static_cast(coarsez.split()); + } + SR.endQuery(intersize, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); - MaxRecoveryBuffer = *max_element(MaxRecoveryBufferPart.begin(), - MaxRecoveryBufferPart.end()); + // Prevent from optimizing out + cout << "### Ignore: " + << (FakeCheckSum(recoveryBuffer.data(), recoveryBuffer.size()) + + FakeCheckSum(intersection_result.data(), + intersection_result.size())) + << endl; + } + + /* + * If NumberOfPartitions == 0, we call a well-tested function with no + * partitions + */ + void bitmaptest(int NumberOfPartitions, uint32_t th, + BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + if (NumberOfPartitions > 1) { + bitmaptestsplit(NumberOfPartitions, th, uncompPosts, allPostIds); + } else { + bitmaptestnosplit(th, uncompPosts, allPostIds); + } + } + + void bitmaptestsplit(int NumberOfPartitions, uint32_t th, + BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + pair range = + uncompPosts.findDocumentIDRange(allPostIds); + vector recovbufferHybrid; + vector> hybridPart(NumberOfPartitions); + vector MaxRecoveryBufferPart(NumberOfPartitions + 1); + + vector bounds(NumberOfPartitions + 1); + + for (int part = 0; part < NumberOfPartitions; ++part) { 
+ bounds[part] = range.first + + part * (range.second - range.first) / + NumberOfPartitions; // slightly uneven + } + bounds[NumberOfPartitions] = range.second + 1; - vector intersection_result(MaxPostingSizePart + 1024); - vector total_intersection_result(TotalMaxPostingSize + 1024); + for (int i = 0; i < NumberOfPartitions; ++i) { + hybridPart[i] = unique_ptr(new HybM2( + scheme, Inter, bounds[i + 1] - bounds[i], recovbufferHybrid, th)); + } -#if 1 - // testing round, does the algorithm have errors? - for (vector qids : allPostIds) { - size_t total_sizeout = 0; + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about - for (int part = 0; part < NumberOfPartitions; ++part) { - size_t sizeout = intersection_result.size(); - hybridPart[part]->intersect(qids, intersection_result.data(), sizeout); + size_t TotalMaxPostingSize = 0; + size_t MaxPostingSizePart = 0; - for (size_t k = total_sizeout; k < total_sizeout + sizeout; ++k) - total_intersection_result[k] = intersection_result[k - total_sizeout] + bounds[part]; - total_sizeout += sizeout; - } + // 1. Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + vector &onePost = uncompPosts.getOnePost(id); - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != total_sizeout) { - stringstream err; - err << "Different cardinality, expected: " - << trueintersection.size() - << " got: " << total_sizeout; - throw runtime_error(err.str()); - } - for (uint32_t k = 0; k < trueintersection.size(); ++k) { - if (trueintersection[k] != total_intersection_result[k]) { - throw runtime_error("intersection bug"); - } - } + if (TotalMaxPostingSize < onePost.size()) { + TotalMaxPostingSize = onePost.size(); } -#endif - coarsez.reset(); - - // 2. 
Finally benchmarking - for (vector qids : allPostIds) { - size_t total_sizeout = 0; - - SR.prepareQuery(); - for (int part = 0; part < NumberOfPartitions; ++part) { - size_t sizeout = 0; - - // Intersect will fail if there is an empty list - sizeout = intersection_result.size(); - unpackVolume += hybridPart[part]->intersect(qids, - intersection_result.data(), sizeout); - for (size_t k = total_sizeout; k < total_sizeout + sizeout; ++k) - total_intersection_result[k] = intersection_result[k - total_sizeout] + bounds[part]; - total_sizeout += sizeout; + vector::iterator i = onePost.begin(); + uint32_t sanitycheck = 0; + for (int part = 0; (part < NumberOfPartitions); ++part) { + if (i == onePost.end()) { + /* + * We need to load the empty posting or the intersection + * will fail + */ + if (!hybridPart[part]->hasBeenLoaded( + id)) { // wasn't compressed earlier + static vector emptyPost; // thread-safe in C++ 11 + + z.reset(); // actually this should be very quick + CompressedSizeDuringPacking += hybridPart[part]->load( + id, emptyPost.data(), emptyPost.size()); + packTime += static_cast(z.split()); + + packVolume += 0; } - SR.endQuery(total_sizeout, uncompPosts, qids); - } - - CoarseUnpackInterTime += static_cast(coarsez.split()); - - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(total_intersection_result.data(), - total_intersection_result.size()) << endl; - - } + continue; + } + vector::iterator j = + lower_bound(i, onePost.end(), bounds[part + 1]); + uint32_t thissize = static_cast(j - i); + if (MaxPostingSizePart < thissize) { + MaxPostingSizePart = thissize; + } + if (j != onePost.end()) + assert(*j >= bounds[part + 1]); + assert(*i >= bounds[part]); + sanitycheck += thissize; + + if (!hybridPart[part]->hasBeenLoaded( + id)) { // wasn't compressed earlier + ////////////// + // BEGIN performance-sensitive section for *compression* + // (we don't care that much about it). 
+ ///////////// - void bitmaptestnosplit(uint32_t th, BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; - size_t MaxPostingSize(0); - vector recovbufferHybrid; - HybM2 hybrid(scheme, Inter, MaxId, recovbufferHybrid, th); - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (hybrid.hasBeenLoaded(id)) continue; - vector &onePost = uncompPosts.getOnePost(id); - z.reset(); - CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); - packTime += static_cast(z.split()); - size_t qty = onePost.size(); - packVolume += qty; - if (MaxPostingSize < qty) { - MaxPostingSize = qty; - } + z.reset(); + /* + * We make a copy because + * (1) we need to subtract bounds[part] + * (2) some schemes modify input + * (3) we need 128-bit alignment + */ + vector dirtyCopy(i, j); + + /* + * This might be a bit suboptimal. + * However Leo expects that GCC 4.7 + * would vectorize this operation. + * Anyways, we don't care much + * about compression speeds. 
+ */ + + size_t minId = bounds[part]; + size_t qty = dirtyCopy.size(); + + for (size_t idx = 0; idx < qty; ++idx) { + dirtyCopy[idx] -= minId; } - } - CoarsePackTime += static_cast(coarsez.split()); - vector intersection_result(MaxPostingSize + 1024); - if (MaxRecoveryBuffer < hybrid.sizeOfRecoveryBufferInWords()) - MaxRecoveryBuffer = hybrid.sizeOfRecoveryBufferInWords(); - // testing round - for (vector qids : allPostIds) { - size_t sizeout = intersection_result.size(); - hybrid.intersect(qids, intersection_result.data(), sizeout); - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); - for (uint32_t k = 0; k < sizeout; ++k) - if (trueintersection[k] != intersection_result[k]) throw runtime_error("intersection bug"); - } - coarsez.reset(); - for (vector qids : allPostIds) { - SR.prepareQuery(); - size_t sizeout = intersection_result.size(); - unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); - SR.endQuery(sizeout, uncompPosts, qids); + CompressedSizeDuringPacking += + hybridPart[part]->load(id, dirtyCopy.data(), dirtyCopy.size()); + packTime += static_cast(z.split()); + + packVolume += thissize; + ////////////// + // END performance-sensitive section for *compression* + ////////////// + } + i = j; + + if (MaxRecoveryBufferPart[part] < + hybridPart[part]->sizeOfRecoveryBufferInWords()) + MaxRecoveryBufferPart[part] = + hybridPart[part]->sizeOfRecoveryBufferInWords(); } - CoarseUnpackInterTime += static_cast(coarsez.split()); - - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(intersection_result.data(), - intersection_result.size()) << endl; + assert(i == onePost.end()); + assert(sanitycheck == onePost.size()); + packTime += static_cast(z.split()); + } } + CoarsePackTime += static_cast(coarsez.split()); + MaxRecoveryBuffer = *max_element(MaxRecoveryBufferPart.begin(), + 
MaxRecoveryBufferPart.end()); + vector intersection_result(MaxPostingSizePart + 1024); + vector total_intersection_result(TotalMaxPostingSize + 1024); - void bitmapskippingtest(uint32_t BS, uint32_t th, BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; - size_t MaxPostingSize(0); - SkippingHybM2 hybrid(MaxId, th, BS); - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (hybrid.hasBeenLoaded(id)) continue; - vector &onePost = uncompPosts.getOnePost(id); - z.reset(); - CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); - packTime += static_cast(z.split()); - size_t qty = onePost.size(); - packVolume += qty; - if (MaxPostingSize < qty) { - MaxPostingSize = qty; - } - } - } - CoarsePackTime += static_cast(coarsez.split()); - vector intersection_result(MaxPostingSize + 1024); - // testing round - for (vector qids : allPostIds) { - size_t sizeout = intersection_result.size(); - hybrid.intersect(qids, intersection_result.data(), sizeout); - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); - for (uint32_t k = 0; k < sizeout; ++k) - if (trueintersection[k] != intersection_result[k]) throw runtime_error("intersection bug"); - } - - coarsez.reset(); - for (vector qids : allPostIds) { - SR.prepareQuery(); - size_t sizeout = intersection_result.size(); - unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); - SR.endQuery(sizeout, uncompPosts, qids); +#if 1 + // testing round, does the algorithm have errors? 
+ for (vector qids : allPostIds) { + size_t total_sizeout = 0; + + for (int part = 0; part < NumberOfPartitions; ++part) { + size_t sizeout = intersection_result.size(); + hybridPart[part]->intersect(qids, intersection_result.data(), sizeout); + + for (size_t k = total_sizeout; k < total_sizeout + sizeout; ++k) + total_intersection_result[k] = + intersection_result[k - total_sizeout] + bounds[part]; + total_sizeout += sizeout; + } + + vector trueintersection = + uncompPosts.computeIntersection(qids, onesidedgallopingintersection); + if (trueintersection.size() != total_sizeout) { + stringstream err; + err << "Different cardinality, expected: " << trueintersection.size() + << " got: " << total_sizeout; + throw runtime_error(err.str()); + } + for (uint32_t k = 0; k < trueintersection.size(); ++k) { + if (trueintersection[k] != total_intersection_result[k]) { + throw runtime_error("intersection bug"); } - CoarseUnpackInterTime += static_cast(coarsez.split()); - - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(intersection_result.data(), - intersection_result.size()) << endl; + } } +#endif + coarsez.reset(); + + // 2. 
Finally benchmarking + for (vector qids : allPostIds) { + size_t total_sizeout = 0; + + SR.prepareQuery(); + for (int part = 0; part < NumberOfPartitions; ++part) { + size_t sizeout = 0; + + // Intersect will fail if there is an empty list + sizeout = intersection_result.size(); + unpackVolume += hybridPart[part]->intersect( + qids, intersection_result.data(), sizeout); + for (size_t k = total_sizeout; k < total_sizeout + sizeout; ++k) + total_intersection_result[k] = + intersection_result[k - total_sizeout] + bounds[part]; + total_sizeout += sizeout; + } + SR.endQuery(total_sizeout, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); - void uncompressedbitmaptest(uint32_t th, BudgetedPostingCollector &uncompPosts, - const vector> &allPostIds) { - uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; - size_t MaxPostingSize(0); - UncompressedHybM2 hybrid(Inter, MaxId, th); - WallClockTimer z;// this is use only to time what we care - WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about - // 1. 
Benchmark only compression - for (const vector &qids : allPostIds) { - for (uint32_t id : qids) { - if (hybrid.hasBeenLoaded(id)) continue; - vector &onePost = uncompPosts.getOnePost(id); - z.reset(); - CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); - packTime += static_cast(z.split()); - size_t qty = onePost.size(); - packVolume += qty; - if (MaxPostingSize < qty) { - MaxPostingSize = qty; - } - } - } - CoarsePackTime += static_cast(coarsez.split()); - vector intersection_result(MaxPostingSize + 1024); - // testing round - for (vector qids : allPostIds) { - size_t sizeout = intersection_result.size(); - hybrid.intersect(qids, intersection_result.data(), sizeout); - vector trueintersection = uncompPosts.computeIntersection(qids, onesidedgallopingintersection); - if (trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); - for (uint32_t k = 0; k < sizeout; ++k) - if (trueintersection[k] != intersection_result[k]) throw runtime_error("intersection bug"); - } + // Prevent from optimizing out + cout << "### Ignore: " << FakeCheckSum(total_intersection_result.data(), + total_intersection_result.size()) + << endl; + } - coarsez.reset(); - for (vector qids : allPostIds) { - SR.prepareQuery(); - size_t sizeout = intersection_result.size(); - unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); - SR.endQuery(sizeout, uncompPosts, qids); + void bitmaptestnosplit(uint32_t th, BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + vector recovbufferHybrid; + HybM2 hybrid(scheme, Inter, MaxId, recovbufferHybrid, th); + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. 
Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (hybrid.hasBeenLoaded(id)) + continue; + vector &onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += + hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast(z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; } - CoarseUnpackInterTime += static_cast(coarsez.split()); - - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(intersection_result.data(), - intersection_result.size()) << endl; + } + } + CoarsePackTime += static_cast(coarsez.split()); + vector intersection_result(MaxPostingSize + 1024); + if (MaxRecoveryBuffer < hybrid.sizeOfRecoveryBufferInWords()) + MaxRecoveryBuffer = hybrid.sizeOfRecoveryBufferInWords(); + // testing round + for (vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = + uncompPosts.computeIntersection(qids, onesidedgallopingintersection); + if (trueintersection.size() != sizeout) + throw runtime_error("not even same cardinality"); + for (uint32_t k = 0; k < sizeout; ++k) + if (trueintersection[k] != intersection_result[k]) + throw runtime_error("intersection bug"); } + coarsez.reset(); + for (vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += + hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); + // Prevent from optimizing out + cout << "### Ignore: " + << FakeCheckSum(intersection_result.data(), intersection_result.size()) + << endl; + } - /** - * This runs a a posting-list intersection tests without compression. 
- */ - void testUncompressed(BudgetedPostingCollector &uncompPosts, + void bitmapskippingtest(uint32_t BS, uint32_t th, + BudgetedPostingCollector &uncompPosts, const vector> &allPostIds) { - size_t MaxPostingSize = uncompPosts.findMaxPostingSize(allPostIds); - vector inter(MaxPostingSize); - WallClockTimer coarsez; - for (vector qids : allPostIds) { - SR.prepareQuery(); - if (qids.empty()) continue; // odd but could happen? - vector> sizeids; - for (uint32_t i : qids) { - sizeids.emplace_back(make_pair(uncompPosts.getOnePost(i).size(), i)); - } - sort(sizeids.begin(), sizeids.end()); - vector *answer = & uncompPosts.getOnePost(sizeids.front().second); - size_t intersize = answer->size(); - unpackVolume += intersize; - for (size_t k = 1; (intersize > 0) && (k < sizeids.size()); ++k) { - vector &nextone = uncompPosts.getOnePost(sizeids[k].second); - unpackVolume += nextone.size(); - intersize = Inter(answer->data(), intersize, - nextone.data(), nextone.size(), inter.data()); - answer = & inter; - } - SR.endQuery(intersize, uncompPosts, qids); + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + SkippingHybM2 hybrid(MaxId, th, BS); + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. 
Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (hybrid.hasBeenLoaded(id)) + continue; + vector &onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += + hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast(z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; } - CoarseUnpackInterTime += static_cast(coarsez.split()); - // Prevent from optimizing out - cout << "### Ignore: " - << FakeCheckSum(inter.data(), inter.size()) << endl; + } + } + CoarsePackTime += static_cast(coarsez.split()); + vector intersection_result(MaxPostingSize + 1024); + // testing round + for (vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = + uncompPosts.computeIntersection(qids, onesidedgallopingintersection); + if (trueintersection.size() != sizeout) + throw runtime_error("not even same cardinality"); + for (uint32_t k = 0; k < sizeout; ++k) + if (trueintersection[k] != intersection_result[k]) + throw runtime_error("intersection bug"); } + coarsez.reset(); + for (vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += + hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); - void printNumbers(bool detailed) const { - cout << endl; - if (! 
detailed) { - cout << "# coarse total speed (mis) + avg (ms/query) + median (ms/query) + 90perc (ms/query)" - << endl; - cout << setw(10) << setprecision(4) - << static_cast(unpackVolume) - / (CoarseUnpackInterTime) ; - cout << setw(10) << setprecision(4) - << SR.averageTimeInMS(); - cout << setw(10) << setprecision(4) - << SR.medianTimeInMS(); - cout << setw(10) << setprecision(4) - << SR.ninetypercentileTimeInMS() << endl; - - return; + // Prevent from optimizing out + cout << "### Ignore: " + << FakeCheckSum(intersection_result.data(), intersection_result.size()) + << endl; + } + + void uncompressedbitmaptest(uint32_t th, + BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + UncompressedHybM2 hybrid(Inter, MaxId, th); + WallClockTimer z; // this is use only to time what we care + WallClockTimer coarsez; // for "coarse" timings, it includes many things we + // don't care about + // 1. 
Benchmark only compression + for (const vector &qids : allPostIds) { + for (uint32_t id : qids) { + if (hybrid.hasBeenLoaded(id)) + continue; + vector &onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += + hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast(z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; } - cout << "# max recovery buffer = " << setprecision(4) << (static_cast(MaxRecoveryBuffer * sizeof(uint32_t)) / - (1024.0 * 1024.0)) << "MB" << endl; - - if (packVolume > 0) { - cout << "# compression: number of integers + bits/int + speed (mis) + coarse speed (mis) " << endl; - cout << setw(20) << packVolume << setw(20) << setprecision(4) << - static_cast(CompressedSizeDuringPacking * sizeof(uint32_t)) * 8.0 / static_cast(packVolume) - << setw(20) << setprecision(4) << static_cast(packVolume) / packTime - << setw(20) << setprecision(4) << static_cast(packVolume) / CoarsePackTime - << endl; - }; - if (unpackVolume > 0) { - cout - << "# decompression: number of integers + "; - if (unpackTime > 0) - cout << " decompression speed (mis) +"; - if (interTime > 0) - cout << " intersection speed (mis) +"; - if ((unpackTime + interTime) > 0) - cout << " total speed (mis) + "; - cout << "coarse total speed (mis) +"; - cout << "avg (ms/query) + median (ms/query) + 90perc (ms/query)" - << endl; - cout << setw(10) << unpackVolume; - if (unpackTime > 0) - cout << setw(10) << setprecision(4) - << static_cast(unpackVolume) / unpackTime; - if (interTime > 0) - cout << setw(10) << setprecision(4) - << static_cast(unpackVolume) / interTime; - if ((unpackTime + interTime) > 0) - cout << setw(10) << setprecision(4) - << static_cast(unpackVolume) / (unpackTime - + interTime); - cout << setw(10) << setprecision(4) - << static_cast(unpackVolume) - / (CoarseUnpackInterTime); - cout << setw(10) << setprecision(4) - << SR.averageTimeInMS(); - cout << setw(10) << 
setprecision(4) - << SR.medianTimeInMS(); - cout << setw(10) << setprecision(4) - << SR.ninetypercentileTimeInMS() << endl; - }; - - cout << endl; + } + } + CoarsePackTime += static_cast(coarsez.split()); + vector intersection_result(MaxPostingSize + 1024); + // testing round + for (vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = + uncompPosts.computeIntersection(qids, onesidedgallopingintersection); + if (trueintersection.size() != sizeout) + throw runtime_error("not even same cardinality"); + for (uint32_t k = 0; k < sizeout; ++k) + if (trueintersection[k] != intersection_result[k]) + throw runtime_error("intersection bug"); } - StatisticsRecorder &getStatisticsRecorder() {return SR;} + coarsez.reset(); + for (vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += + hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); + + // Prevent from optimizing out + cout << "### Ignore: " + << FakeCheckSum(intersection_result.data(), intersection_result.size()) + << endl; + } + + /** + * This runs a a posting-list intersection tests without compression. + */ + void testUncompressed(BudgetedPostingCollector &uncompPosts, + const vector> &allPostIds) { + size_t MaxPostingSize = uncompPosts.findMaxPostingSize(allPostIds); + vector inter(MaxPostingSize); + WallClockTimer coarsez; + for (vector qids : allPostIds) { + SR.prepareQuery(); + if (qids.empty()) + continue; // odd but could happen? 
+ vector> sizeids; + for (uint32_t i : qids) { + sizeids.emplace_back(make_pair(uncompPosts.getOnePost(i).size(), i)); + } + sort(sizeids.begin(), sizeids.end()); + vector *answer = + &uncompPosts.getOnePost(sizeids.front().second); + size_t intersize = answer->size(); + unpackVolume += intersize; + for (size_t k = 1; (intersize > 0) && (k < sizeids.size()); ++k) { + vector &nextone = uncompPosts.getOnePost(sizeids[k].second); + unpackVolume += nextone.size(); + intersize = Inter(answer->data(), intersize, nextone.data(), + nextone.size(), inter.data()); + answer = &inter; + } + SR.endQuery(intersize, uncompPosts, qids); + } + CoarseUnpackInterTime += static_cast(coarsez.split()); + // Prevent from optimizing out + cout << "### Ignore: " << FakeCheckSum(inter.data(), inter.size()) << endl; + } + + void printNumbers(bool detailed) const { + cout << endl; + if (!detailed) { + cout << "# coarse total speed (mis) + avg (ms/query) + median " + "(ms/query) + 90perc (ms/query)" + << endl; + cout << setw(10) << setprecision(4) + << static_cast(unpackVolume) / (CoarseUnpackInterTime); + cout << setw(10) << setprecision(4) << SR.averageTimeInMS(); + cout << setw(10) << setprecision(4) << SR.medianTimeInMS(); + cout << setw(10) << setprecision(4) << SR.ninetypercentileTimeInMS() + << endl; + + return; + } + cout << "# max recovery buffer = " << setprecision(4) + << (static_cast(MaxRecoveryBuffer * sizeof(uint32_t)) / + (1024.0 * 1024.0)) + << "MB" << endl; + + if (packVolume > 0) { + cout << "# compression: number of integers + bits/int + speed (mis) + " + "coarse speed (mis) " + << endl; + cout << setw(20) << packVolume << setw(20) << setprecision(4) + << static_cast(CompressedSizeDuringPacking * + sizeof(uint32_t)) * + 8.0 / static_cast(packVolume) + << setw(20) << setprecision(4) + << static_cast(packVolume) / packTime << setw(20) + << setprecision(4) + << static_cast(packVolume) / CoarsePackTime << endl; + }; + if (unpackVolume > 0) { + cout << "# decompression: number of 
integers + "; + if (unpackTime > 0) + cout << " decompression speed (mis) +"; + if (interTime > 0) + cout << " intersection speed (mis) +"; + if ((unpackTime + interTime) > 0) + cout << " total speed (mis) + "; + cout << "coarse total speed (mis) +"; + cout << "avg (ms/query) + median (ms/query) + 90perc (ms/query)" << endl; + cout << setw(10) << unpackVolume; + if (unpackTime > 0) + cout << setw(10) << setprecision(4) + << static_cast(unpackVolume) / unpackTime; + if (interTime > 0) + cout << setw(10) << setprecision(4) + << static_cast(unpackVolume) / interTime; + if ((unpackTime + interTime) > 0) + cout << setw(10) << setprecision(4) + << static_cast(unpackVolume) / (unpackTime + interTime); + cout << setw(10) << setprecision(4) + << static_cast(unpackVolume) / (CoarseUnpackInterTime); + cout << setw(10) << setprecision(4) << SR.averageTimeInMS(); + cout << setw(10) << setprecision(4) << SR.medianTimeInMS(); + cout << setw(10) << setprecision(4) << SR.ninetypercentileTimeInMS() + << endl; + }; + + cout << endl; + } + + StatisticsRecorder &getStatisticsRecorder() { return SR; } private: - IntegerCODEC &scheme; - size_t CompressedSizeDuringPacking; - size_t packVolume; - size_t unpackVolume; - double packTime; - double unpackTime; - double interTime; - intersectionfunction Inter; - double CoarsePackTime; - double CoarseUnpackInterTime; - const bool modifiesinput;// whether codec modifies the input during compression - size_t MaxRecoveryBuffer; - StatisticsRecorder SR; + IntegerCODEC &scheme; + size_t CompressedSizeDuringPacking; + size_t packVolume; + size_t unpackVolume; + double packTime; + double unpackTime; + double interTime; + intersectionfunction Inter; + double CoarsePackTime; + double CoarseUnpackInterTime; + const bool + modifiesinput; // whether codec modifies the input during compression + size_t MaxRecoveryBuffer; + StatisticsRecorder SR; }; int main(int argc, char **argv) { - std::string InterName = "galloping"; - bool bIncludeOnePostQuery = false; - 
bool quiet = false; - size_t maxLinesToProcess = numeric_limits::max(); - - bool useCompression = false; - int SkipLog = 0; - int th = -1; - - string scheme = "copy"; - int partitions = 1; - bool dumpcompletestats = false; - - size_t memBudget = 1024ULL * 1024 * 1024 * 4; // 4GB seems like a better default than 16GB - - int c; - while ((c = getopt(argc, argv, "i:os:b:hl:p:qk:B:d")) != -1) { - switch (c) { - case 'd': - dumpcompletestats = true; - break; - case 'i': - InterName = optarg; - if (! IntersectionFactory::valid(InterName)) { - cerr << "I don't recognize the intersection scheme '" << InterName << "' " << endl; - printusage(argv[0]); - return -1; - } - break; - case 'B': - th = atoi(optarg); - if (th < 0) { - cerr << "th param needs to be within [0,infty)." << endl; - printusage(argv[0]); - return -1; - } - break; - case 'k': - SkipLog = atoi(optarg); - if ((SkipLog < 1) || (SkipLog > 31)) { - cerr << "Skip param needs to be within [1,31]." << endl; - printusage(argv[0]); - return -1; - } - break; - case 'p': - partitions = atoi(optarg); - if (partitions < 0) { - printusage(argv[0]); - return -1; - } - break; - case 'q': - quiet = true; - break; - case 'o': - bIncludeOnePostQuery = true; - break; - case 's': - scheme = optarg; - useCompression = true; - break; - case 'b': - memBudget = 1024ULL * 1024 * 1024 * atol(optarg); - break; - case 'l': - maxLinesToProcess = atol(optarg); - break; - case 'h': - printusage(argv[0]); - return 0; - default: - printusage(argv[0]); - return -1; - } - } - if (optind + 1 >= argc) { + std::string InterName = "galloping"; + bool bIncludeOnePostQuery = false; + bool quiet = false; + size_t maxLinesToProcess = numeric_limits::max(); + + bool useCompression = false; + int SkipLog = 0; + int th = -1; + + string scheme = "copy"; + int partitions = 1; + bool dumpcompletestats = false; + + size_t memBudget = + 1024ULL * 1024 * 1024 * 4; // 4GB seems like a better default than 16GB + + int c; + while ((c = getopt(argc, argv, 
"i:os:b:hl:p:qk:B:d")) != -1) { + switch (c) { + case 'd': + dumpcompletestats = true; + break; + case 'i': + InterName = optarg; + if (!IntersectionFactory::valid(InterName)) { + cerr << "I don't recognize the intersection scheme '" << InterName + << "' " << endl; printusage(argv[0]); return -1; - } - if (!CODECFactory::valid(scheme)) { - cout << "Compression scheme " << scheme << " is unknown. Try one of these: " << endl; - for (string n : CODECFactory::allNames()) - cout << n << endl; + } + break; + case 'B': + th = atoi(optarg); + if (th < 0) { + cerr << "th param needs to be within [0,infty)." << endl; + printusage(argv[0]); return -1; - } - if ((SkipLog > 0) && (useCompression)) { - cout << "conflicting options. You cannot mix skipping and compression." << endl; + } + break; + case 'k': + SkipLog = atoi(optarg); + if ((SkipLog < 1) || (SkipLog > 31)) { + cerr << "Skip param needs to be within [1,31]." << endl; printusage(argv[0]); return -1; + } + break; + case 'p': + partitions = atoi(optarg); + if (partitions < 0) { + printusage(argv[0]); + return -1; + } + break; + case 'q': + quiet = true; + break; + case 'o': + bIncludeOnePostQuery = true; + break; + case 's': + scheme = optarg; + useCompression = true; + break; + case 'b': + memBudget = 1024ULL * 1024 * 1024 * atol(optarg); + break; + case 'l': + maxLinesToProcess = atol(optarg); + break; + case 'h': + printusage(argv[0]); + return 0; + default: + printusage(argv[0]); + return -1; } - const bool modifiesinput = CODECFactory::modifiesInputDuringCompression(*CODECFactory::getFromName(scheme)); - - cout << "# Memory budget for uncompressed postings: " << setprecision(2) - << static_cast(memBudget) / 1024.0 / 1024.0 / 1024.0 << "GB" << endl; - cout << "# Maximum number of queries: " << maxLinesToProcess << endl; - if (SkipLog != 0) { - cout << "# testing skipping with block size: " << (1 << SkipLog) << endl; - if (th >= 0) { - cout << "# bitmap threshold: " << th << endl; - } - } else if (useCompression) { 
- cout << "# Compression scheme: " << scheme << endl; - cout << "# Does it modify inputs during compression? : " << (modifiesinput ? "yes" : "no") << endl; - cout << "# Intersection proc: " << InterName << endl; - if (th >= 0) { - if (th == 0) - cout << "# bitmap threshold: automatic (0)" << endl; - else - cout << "# bitmap threshold: " << th << endl; - } - if (partitions > 1) { - cout << "# number of partitions: " << partitions << endl; - } - } else { - cout << "# Compression is deactivated (use -s followed by scheme to enable)" << endl; - cout << "# Intersection proc: " << InterName << endl; - if (th >= 0) - cout << "# bitmap threshold: " << th << endl; - + } + if (optind + 1 >= argc) { + printusage(argv[0]); + return -1; + } + if (!CODECFactory::valid(scheme)) { + cout << "Compression scheme " << scheme + << " is unknown. Try one of these: " << endl; + for (string n : CODECFactory::allNames()) + cout << n << endl; + return -1; + } + if ((SkipLog > 0) && (useCompression)) { + cout << "conflicting options. You cannot mix skipping and compression." + << endl; + printusage(argv[0]); + return -1; + } + const bool modifiesinput = CODECFactory::modifiesInputDuringCompression( + *CODECFactory::getFromName(scheme)); + + cout << "# Memory budget for uncompressed postings: " << setprecision(2) + << static_cast(memBudget) / 1024.0 / 1024.0 / 1024.0 << "GB" + << endl; + cout << "# Maximum number of queries: " << maxLinesToProcess << endl; + if (SkipLog != 0) { + cout << "# testing skipping with block size: " << (1 << SkipLog) << endl; + if (th >= 0) { + cout << "# bitmap threshold: " << th << endl; } - cout << "# Do we include one-word queries: " << (bIncludeOnePostQuery ? "yes" : "no") << endl; - - cout << "# Quiet: " << (quiet ? 
"yes" : "no") << endl; - - - try { - string postFileName = argv[optind]; - string logFileName = argv[optind + 1]; - - ifstream logFile(logFileName.c_str()); - if (!logFile.is_open()) { - cerr << " Couldn't open query log file " << logFileName << endl; - printusage(argv[0]); - return -1; - } - cout << "# Parsing query file " << logFileName << endl; - - logFile.exceptions( - ios::badbit); // will throw an exception if something goes wrong, saves us the trouble of checking the IO status - - - TestHelper testBed(*CODECFactory::getFromName(scheme), IntersectionFactory::getFromName(InterName)); - - cout << "# Loading posting lists from " << postFileName << "... please be patient." << endl; - BudgetedPostingCollector uncompPosts(postFileName, memBudget); - cout << "# Found " << uncompPosts.getMaxPostQty() << " posting lists." << endl; - if (! quiet) cout << "# After each block, we output the *cumulative* results. Expect convergence. " << endl; - string line; - line.reserve(1024); - - vector> allPostIds; // this a buffer where queries are stored. - - size_t skippedQty = 0, lineQty = 0, randPickQty = 0; - - bool detailedDisplay = useCompression || (SkipLog != 0) || (th >= 0); - + } else if (useCompression) { + cout << "# Compression scheme: " << scheme << endl; + cout << "# Does it modify inputs during compression? : " + << (modifiesinput ? "yes" : "no") << endl; + cout << "# Intersection proc: " << InterName << endl; + if (th >= 0) { + if (th == 0) + cout << "# bitmap threshold: automatic (0)" << endl; + else + cout << "# bitmap threshold: " << th << endl; + } + if (partitions > 1) { + cout << "# number of partitions: " << partitions << endl; + } + } else { + cout << "# Compression is deactivated (use -s followed by scheme to enable)" + << endl; + cout << "# Intersection proc: " << InterName << endl; + if (th >= 0) + cout << "# bitmap threshold: " << th << endl; + } + cout << "# Do we include one-word queries: " + << (bIncludeOnePostQuery ? 
"yes" : "no") << endl; + + cout << "# Quiet: " << (quiet ? "yes" : "no") << endl; + + try { + string postFileName = argv[optind]; + string logFileName = argv[optind + 1]; + + ifstream logFile(logFileName.c_str()); + if (!logFile.is_open()) { + cerr << " Couldn't open query log file " << logFileName << endl; + printusage(argv[0]); + return -1; + } + cout << "# Parsing query file " << logFileName << endl; - vector oneQueryPostIds; // buffer where a single query is stored - for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess) && logFile - && getline(logFile, line); ++lineQty) { - stringstream lineStr(line); - oneQueryPostIds.clear(); + logFile.exceptions(ios::badbit); // will throw an exception if something + // goes wrong, saves us the trouble of + // checking the IO status - { - uint32_t id; - while (lineStr >> id) { - if (uncompPosts.valid(id)) oneQueryPostIds.emplace_back(id); - } - } - if (oneQueryPostIds.empty() || - ((! bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1)) - ) { - skippedQty++; - continue; - } - if (!uncompPosts.loadPostings(oneQueryPostIds)) {// we couldn't load them all in - if (SkipLog != 0) { - if (th >= 0) { - testBed.bitmapskippingtest(SkipLog, th, uncompPosts, allPostIds); - } else { - testBed.skippingtest(SkipLog, uncompPosts, allPostIds); - } - } else if (useCompression) { - if (th >= 0) - testBed.bitmaptest(partitions, th, uncompPosts, allPostIds); - else if (partitions > 1) - testBed.splittedtest(partitions, uncompPosts, allPostIds); - else - testBed.test(uncompPosts, allPostIds); - } else { - if (th >= 0) - testBed.uncompressedbitmaptest(th, uncompPosts, allPostIds); - else - testBed.testUncompressed(uncompPosts, allPostIds); - } - if (! quiet) cout << "# queries processed so far: " << (lineQty - skippedQty) << endl; - if (! 
quiet) testBed.printNumbers(detailedDisplay); - uncompPosts.clear(); - allPostIds.clear(); - assert(uncompPosts.getMemUsed() == 0); - if (!uncompPosts.loadPostings(oneQueryPostIds)) { - cerr << "Cannot load postings for the query '" << line - << "' after all postings are deleted. Perhaps, the memory budget is too small, or keepQtyPost is too large." << endl; - cerr << "Aborting!" << endl; - return -1; - } - } - allPostIds.emplace_back(oneQueryPostIds); + TestHelper testBed(*CODECFactory::getFromName(scheme), + IntersectionFactory::getFromName(InterName)); + cout << "# Loading posting lists from " << postFileName + << "... please be patient." << endl; + BudgetedPostingCollector uncompPosts(postFileName, memBudget); + cout << "# Found " << uncompPosts.getMaxPostQty() << " posting lists." + << endl; + if (!quiet) + cout << "# After each block, we output the *cumulative* results. Expect " + "convergence. " + << endl; + string line; + line.reserve(1024); + + vector> + allPostIds; // this a buffer where queries are stored. 
+ + size_t skippedQty = 0, lineQty = 0, randPickQty = 0; + + bool detailedDisplay = useCompression || (SkipLog != 0) || (th >= 0); + + vector oneQueryPostIds; // buffer where a single query is stored + for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess) && + logFile && getline(logFile, line); + ++lineQty) { + stringstream lineStr(line); + oneQueryPostIds.clear(); + + { + uint32_t id; + while (lineStr >> id) { + if (uncompPosts.valid(id)) + oneQueryPostIds.emplace_back(id); } - logFile.close();// we are done - - // final report - cout << "### FINAL REPORT: " << endl; + } + if (oneQueryPostIds.empty() || + ((!bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1))) { + skippedQty++; + continue; + } + if (!uncompPosts.loadPostings( + oneQueryPostIds)) { // we couldn't load them all in if (SkipLog != 0) { - if (th >= 0) { - testBed.bitmapskippingtest(SkipLog, th, uncompPosts, allPostIds); - } else { - testBed.skippingtest(SkipLog, uncompPosts, allPostIds); - } + if (th >= 0) { + testBed.bitmapskippingtest(SkipLog, th, uncompPosts, allPostIds); + } else { + testBed.skippingtest(SkipLog, uncompPosts, allPostIds); + } } else if (useCompression) { - if (th >= 0) - testBed.bitmaptest(partitions, th, uncompPosts, allPostIds); - else if (partitions > 1) - testBed.splittedtest(partitions, uncompPosts, allPostIds); - else - testBed.test(uncompPosts, allPostIds); + if (th >= 0) + testBed.bitmaptest(partitions, th, uncompPosts, allPostIds); + else if (partitions > 1) + testBed.splittedtest(partitions, uncompPosts, allPostIds); + else + testBed.test(uncompPosts, allPostIds); } else { - if (th >= 0) - testBed.uncompressedbitmaptest(th, uncompPosts, allPostIds); - else - testBed.testUncompressed(uncompPosts, allPostIds); + if (th >= 0) + testBed.uncompressedbitmaptest(th, uncompPosts, allPostIds); + else + testBed.testUncompressed(uncompPosts, allPostIds); } - testBed.printNumbers(detailedDisplay); - - cout << "# Total lines: " << lineQty << - " processed: " << (lineQty - 
skippedQty) << - " skipped: " << skippedQty << " assigned randomly: " << randPickQty << endl; - if (dumpcompletestats) { - cout << "####### Complete stats:" << endl; - testBed.getStatisticsRecorder().output(cout); + if (!quiet) + cout << "# queries processed so far: " << (lineQty - skippedQty) + << endl; + if (!quiet) + testBed.printNumbers(detailedDisplay); + uncompPosts.clear(); + allPostIds.clear(); + assert(uncompPosts.getMemUsed() == 0); + if (!uncompPosts.loadPostings(oneQueryPostIds)) { + cerr << "Cannot load postings for the query '" << line + << "' after all postings are deleted. Perhaps, the memory " + "budget is too small, or keepQtyPost is too large." + << endl; + cerr << "Aborting!" << endl; + return -1; } + } + allPostIds.emplace_back(oneQueryPostIds); + } + logFile.close(); // we are done - } catch (const exception &e) { - cerr << "Run-time error: " << e.what() << endl; - return -1; - } catch (...) { - cerr << "Unknown exception caught, terminating..." << endl; - return -1; + // final report + cout << "### FINAL REPORT: " << endl; + if (SkipLog != 0) { + if (th >= 0) { + testBed.bitmapskippingtest(SkipLog, th, uncompPosts, allPostIds); + } else { + testBed.skippingtest(SkipLog, uncompPosts, allPostIds); + } + } else if (useCompression) { + if (th >= 0) + testBed.bitmaptest(partitions, th, uncompPosts, allPostIds); + else if (partitions > 1) + testBed.splittedtest(partitions, uncompPosts, allPostIds); + else + testBed.test(uncompPosts, allPostIds); + } else { + if (th >= 0) + testBed.uncompressedbitmaptest(th, uncompPosts, allPostIds); + else + testBed.testUncompressed(uncompPosts, allPostIds); } + testBed.printNumbers(detailedDisplay); - return 0; -} + cout << "# Total lines: " << lineQty + << " processed: " << (lineQty - skippedQty) + << " skipped: " << skippedQty << " assigned randomly: " << randPickQty + << endl; + if (dumpcompletestats) { + cout << "####### Complete stats:" << endl; + testBed.getStatisticsRecorder().output(cout); + } + } catch 
(const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } + return 0; +} diff --git a/advancedbenchmarking/src/compflatstat.cpp b/advancedbenchmarking/src/compflatstat.cpp index 408e9ff..fb2aa72 100644 --- a/advancedbenchmarking/src/compflatstat.cpp +++ b/advancedbenchmarking/src/compflatstat.cpp @@ -6,7 +6,6 @@ #include #include - #include "common.h" #include "util.h" #include "timer.h" @@ -15,46 +14,47 @@ using namespace SIMDCompressionLib; uint32_t FakeCheckSum(const uint32_t *p, size_t qty) { - return std::accumulate(p, p + qty, 0); + return std::accumulate(p, p + qty, 0); } int main(int argc, char **argv) { - if (argc != 2) { - cerr << "Usage: " << argv[0] << " " << endl; - return 1; - } - - try { - string postFileName = argv[1]; - - cout << "# Computing statistics of posting lists from " << postFileName << "... please be patient." << endl; + if (argc != 2) { + cerr << "Usage: " << argv[0] << " " << endl; + return 1; + } - size_t totalQty = 0; + try { + string postFileName = argv[1]; - MaropuGapReader grpRead(postFileName); + cout << "# Computing statistics of posting lists from " << postFileName + << "... please be patient." << endl; - if (!grpRead.open()) { - cerr << "Can't open '" << postFileName << "' for reading" << endl; - return 1; - } + size_t totalQty = 0; - size_t wordQty = 0; - off_t pos=0;uint32_t qty=0; - while (grpRead.readNextPosAndQty(pos, qty)) { - ++wordQty; - totalQty += qty; - } - cout << "Final stat, # of words: " << wordQty << " total # of postings: " << totalQty << endl; + MaropuGapReader grpRead(postFileName); - } catch (const exception &e) { - cerr << "Run-time error: " << e.what() << endl; - return -1; - } catch (...) { - cerr << "Unknown exception caught, terminating..." 
<< endl; - return -1; + if (!grpRead.open()) { + cerr << "Can't open '" << postFileName << "' for reading" << endl; + return 1; } - return 0; + size_t wordQty = 0; + off_t pos = 0; + uint32_t qty = 0; + while (grpRead.readNextPosAndQty(pos, qty)) { + ++wordQty; + totalQty += qty; + } + cout << "Final stat, # of words: " << wordQty + << " total # of postings: " << totalQty << endl; + + } catch (const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } + + return 0; } - - diff --git a/advancedbenchmarking/src/compress.cpp b/advancedbenchmarking/src/compress.cpp index 4b07045..46fde35 100644 --- a/advancedbenchmarking/src/compress.cpp +++ b/advancedbenchmarking/src/compress.cpp @@ -16,129 +16,131 @@ using namespace SIMDCompressionLib; void printusage() { - cout << " Try ./compress -s nameofscheme input.bin output.bin" << endl; - + cout << " Try ./compress -s nameofscheme input.bin output.bin" << endl; } int main(int argc, char **argv) { - string scheme; - int c; - while ((c = getopt(argc, argv, "s:h")) != -1) - switch (c) { - case 's': - scheme = optarg; - break; - case 'h': - printusage(); - return 0; - default: - abort(); - } - if (optind + 1 >= argc) { - printusage(); - return -1; + string scheme; + int c; + while ((c = getopt(argc, argv, "s:h")) != -1) + switch (c) { + case 's': + scheme = optarg; + break; + case 'h': + printusage(); + return 0; + default: + abort(); + } + if (optind + 1 >= argc) { + printusage(); + return -1; + } + + try { + string ifilename = argv[optind]; + string ofilename = argv[optind + 1]; + + shared_ptr schemeptr = CODECFactory::getFromName(scheme); + if (schemeptr.get() == NULL) + return -1; + + MaropuGapReader reader(ifilename); + if (!reader.open()) { + cout << " could not open " << ifilename << " for reading..." 
<< endl; + return -1; + } + FILE *fd = ::fopen(ofilename.c_str(), "wb"); + if (fd == NULL) { + cout << " could not open " << ofilename << " for writing..." << endl; + return -1; + } + cout << "Compressing content from " << ifilename << " to " << ofilename + << " using " << scheme << endl; + vector buffer; + + // write a format version number + uint32_t VERSION = 1; + if (fwrite(&VERSION, sizeof(VERSION), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; + } + // store the scheme identifier + uint32_t schemesize = static_cast(scheme.size() * sizeof(char)); + if (fwrite(&schemesize, sizeof(schemesize), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; } - try { - string ifilename = argv[optind]; - string ofilename = argv[optind + 1]; - - shared_ptr schemeptr = CODECFactory::getFromName(scheme); - if (schemeptr.get() == NULL) return -1; - - - MaropuGapReader reader(ifilename); - if (!reader.open()) { - cout << " could not open " << ifilename << " for reading..." << endl; - return -1; - } - FILE *fd = ::fopen(ofilename.c_str(), "wb"); - if (fd == NULL) { - cout << " could not open " << ofilename << " for writing..." 
<< endl; - return -1; - } - cout << "Compressing content from " << ifilename << " to " << ofilename << " using " << scheme << endl; - vector buffer; - - // write a format version number - uint32_t VERSION = 1; - if (fwrite(&VERSION, sizeof(VERSION), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - reader.close(); - return -1; - } - // store the scheme identifier - uint32_t schemesize = static_cast(scheme.size() * sizeof(char)); - if (fwrite(&schemesize, sizeof(schemesize), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - reader.close(); - return -1; - } - - if (fwrite(scheme.c_str(), scheme.size() * sizeof(char), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - reader.close(); - return -1; - } - vector obuffer(1024); - size_t numberofarrays = 0; - size_t volume = 0; - size_t volumeout = 0; - WallClockTimer z; - while (reader.loadIntegers(buffer)) { - if (obuffer.size() < buffer.size() + 1024) { - obuffer.resize(buffer.size() + 1024); - } - size_t outsize = obuffer.size(); - schemeptr->encodeArray(buffer.data(), buffer.size(), obuffer.data(), outsize); - uint32_t osize = static_cast(outsize); - if (fwrite(&osize, sizeof(uint32_t), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - return -1; - } - uint32_t insize = static_cast(buffer.size()); - if (fwrite(&insize, sizeof(uint32_t), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - return -1; - } - if (fwrite(obuffer.data(), sizeof(uint32_t) * outsize, 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - return -1; - } - volumeout += sizeof(osize) + sizeof(insize) + sizeof(uint32_t) * outsize; - volume += buffer.size(); - if (numberofarrays % 1000 == 0) { - cout << "."; - cout.flush(); - } - ++numberofarrays; - } - cout << endl; - long ti = z.split(); - cout << "Wrote " << numberofarrays << " arrays " << endl; - cout << "Compressed " << volume << " integers " << endl; - if (volume > 0) - cout << "Bits per int : " << (8.0 * 
static_cast(volumeout)) / static_cast(volume) << endl; - if (ti > 0) { - double speed = static_cast(volume) / static_cast(ti); - cout << " Speed: " << speed << " mis " << endl; - - } - - } catch (const exception &e) { - cerr << "Run-time error: " << e.what() << endl; + if (fwrite(scheme.c_str(), scheme.size() * sizeof(char), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; + } + vector obuffer(1024); + size_t numberofarrays = 0; + size_t volume = 0; + size_t volumeout = 0; + WallClockTimer z; + while (reader.loadIntegers(buffer)) { + if (obuffer.size() < buffer.size() + 1024) { + obuffer.resize(buffer.size() + 1024); + } + size_t outsize = obuffer.size(); + schemeptr->encodeArray(buffer.data(), buffer.size(), obuffer.data(), + outsize); + uint32_t osize = static_cast(outsize); + if (fwrite(&osize, sizeof(uint32_t), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); return -1; - } catch (...) { - cerr << "Unknown exception caught, terminating..." 
<< endl; + } + uint32_t insize = static_cast(buffer.size()); + if (fwrite(&insize, sizeof(uint32_t), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); return -1; + } + if (fwrite(obuffer.data(), sizeof(uint32_t) * outsize, 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + volumeout += sizeof(osize) + sizeof(insize) + sizeof(uint32_t) * outsize; + volume += buffer.size(); + if (numberofarrays % 1000 == 0) { + cout << "."; + cout.flush(); + } + ++numberofarrays; + } + cout << endl; + long ti = z.split(); + cout << "Wrote " << numberofarrays << " arrays " << endl; + cout << "Compressed " << volume << " integers " << endl; + if (volume > 0) + cout << "Bits per int : " + << (8.0 * static_cast(volumeout)) / + static_cast(volume) + << endl; + if (ti > 0) { + double speed = static_cast(volume) / static_cast(ti); + cout << " Speed: " << speed << " mis " << endl; } - return 0; -} + } catch (const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." 
<< endl; + return -1; + } + return 0; +} diff --git a/advancedbenchmarking/src/entropy.cpp b/advancedbenchmarking/src/entropy.cpp index 106b766..9f99fe5 100644 --- a/advancedbenchmarking/src/entropy.cpp +++ b/advancedbenchmarking/src/entropy.cpp @@ -14,163 +14,160 @@ #include "delta.h" #include "budgetedpostingcollector.h" - using namespace SIMDCompressionLib; void message(const char *prog) { - cerr << " usage : " << prog << " maropubinaryfile querylogfile" - << endl; + cerr << " usage : " << prog << " maropubinaryfile querylogfile" << endl; } class EntropyRecorder { public: - EntropyRecorder() : - counter(), totallength(0) { - } - - void clear() { - counter.clear(); - totallength = 0; - } - void eat(const uint32_t *in, const size_t length) { - if (length == 0) - return; - totallength += length; - for (uint32_t k = 0; k < length; ++k, ++in) { - maptype::iterator i = counter.find(*in); - if (i != counter.end()) - i->second += 1; - else - counter[*in] = 1; - } + EntropyRecorder() : counter(), totallength(0) {} + + void clear() { + counter.clear(); + totallength = 0; + } + void eat(const uint32_t *in, const size_t length) { + if (length == 0) + return; + totallength += length; + for (uint32_t k = 0; k < length; ++k, ++in) { + maptype::iterator i = counter.find(*in); + if (i != counter.end()) + i->second += 1; + else + counter[*in] = 1; } - - double computeShannon() { - double total = 0; - for (maptype::iterator i = counter.begin(); i - != counter.end(); ++i) { - const double x = static_cast(i->second); - total += x / static_cast(totallength) * log(static_cast(totallength) / x) / log(2.0); - } - return total; + } + + double computeShannon() { + double total = 0; + for (maptype::iterator i = counter.begin(); i != counter.end(); ++i) { + const double x = static_cast(i->second); + total += x / static_cast(totallength) * + log(static_cast(totallength) / x) / log(2.0); } - - __attribute__((pure)) - double computeDataBits() { - double total = 0; - for 
(maptype::const_iterator i = counter.begin(); i - != counter.end(); ++i) { - total += static_cast(i->second) / static_cast(totallength) * static_cast(gccbits(i->first)); - } - return total; + return total; + } + + __attribute__((pure)) double computeDataBits() { + double total = 0; + for (maptype::const_iterator i = counter.begin(); i != counter.end(); ++i) { + total += static_cast(i->second) / + static_cast(totallength) * + static_cast(gccbits(i->first)); } - typedef unordered_map maptype; - maptype counter; - size_t totallength; + return total; + } + typedef unordered_map maptype; + maptype counter; + size_t totallength; }; - int main(int argc, char **argv) { - size_t maxLinesToProcess = numeric_limits::max(); - - size_t memBudget = 1024ULL * 1024 * 1024 * 2; // 2GB ought to be enough - bool bIncludeOnePostQuery = false; - - int c; - while ((c = getopt(argc, argv, "ob:l:h")) != -1) { - switch (c) { - case 'o': - bIncludeOnePostQuery = true; - break; - case 'b': - memBudget = 1024ULL * 1024 * 1024 * atol(optarg); - break; - case 'l': - maxLinesToProcess = atol(optarg); - break; - case 'h': - message(argv[0]); - return 0; - default: - message(argv[0]); - return -1; - } + size_t maxLinesToProcess = numeric_limits::max(); + + size_t memBudget = 1024ULL * 1024 * 1024 * 2; // 2GB ought to be enough + bool bIncludeOnePostQuery = false; + + int c; + while ((c = getopt(argc, argv, "ob:l:h")) != -1) { + switch (c) { + case 'o': + bIncludeOnePostQuery = true; + break; + case 'b': + memBudget = 1024ULL * 1024 * 1024 * atol(optarg); + break; + case 'l': + maxLinesToProcess = atol(optarg); + break; + case 'h': + message(argv[0]); + return 0; + default: + message(argv[0]); + return -1; } - - if (optind + 1 >= argc) { - message(argv[0]); - return -1; + } + + if (optind + 1 >= argc) { + message(argv[0]); + return -1; + } + cout << "# Memory budget for uncompressed postings: " << setprecision(2) + << static_cast(memBudget) / 1024.0 / 1024.0 / 1024.0 << "GB" + << endl; + cout << "# 
Maximum number of queries: " << maxLinesToProcess << endl; + cout << "# Do we include one-word queries: " + << (bIncludeOnePostQuery ? "yes" : "no") << endl; + string postFileName = argv[optind]; + string logFileName = argv[optind + 1]; + ifstream logFile(logFileName.c_str()); + if (!logFile.is_open()) { + cerr << " Couldn't open query log file " << logFileName << endl; + message(argv[0]); + return -1; + } + cout << "# Parsing query file " << logFileName << endl; + + logFile.exceptions(ios::badbit); // will throw an exception if something goes + // wrong, saves us the trouble of checking + // the IO status + + EntropyRecorder er; + cout << "# Loading posting lists from " << postFileName + << "... please be patient." << endl; + BudgetedPostingCollector uncompPosts(postFileName, memBudget); + cout << "# Found " << uncompPosts.getMaxPostQty() << " posting lists." + << endl; + string line; + size_t skippedQty = 0, lineQty = 0; + + vector oneQueryPostIds; // buffer where a single query is stored + + for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess) && + logFile && getline(logFile, line); + ++lineQty) { + stringstream lineStr(line); + + oneQueryPostIds.clear(); + + { + uint32_t id; + while (lineStr >> id) { + if (uncompPosts.valid(id)) + oneQueryPostIds.emplace_back(id); + } + } + if (oneQueryPostIds.empty() || + ((!bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1))) { + skippedQty++; + continue; } - cout << "# Memory budget for uncompressed postings: " << setprecision(2) - << static_cast(memBudget) / 1024.0 / 1024.0 / 1024.0 << "GB" << endl; - cout << "# Maximum number of queries: " << maxLinesToProcess << endl; - cout << "# Do we include one-word queries: " << (bIncludeOnePostQuery ? 
"yes" : "no") << endl; - string postFileName = argv[optind]; - string logFileName = argv[optind + 1]; - ifstream logFile(logFileName.c_str()); - if (!logFile.is_open()) { - cerr << " Couldn't open query log file " << logFileName << endl; - message(argv[0]); + if (!uncompPosts.loadPostings( + oneQueryPostIds)) { // we couldn't load them all in + uncompPosts.clear(); + assert(uncompPosts.getMemUsed() == 0); + if (!uncompPosts.loadPostings(oneQueryPostIds)) { + cerr << "Cannot load postings for the query '" << line + << "' after all postings are deleted. Perhaps, the memory budget " + "is too small." + << endl; + cerr << "Aborting!" << endl; return -1; + } } - cout << "# Parsing query file " << logFileName << endl; - - logFile.exceptions( - ios::badbit); // will throw an exception if something goes wrong, saves us the trouble of checking the IO status - - EntropyRecorder er; - cout << "# Loading posting lists from " << postFileName << "... please be patient." << endl; - BudgetedPostingCollector uncompPosts(postFileName, memBudget); - cout << "# Found " << uncompPosts.getMaxPostQty() << " posting lists." << endl; - string line; - size_t skippedQty = 0, lineQty = 0; - - vector oneQueryPostIds; // buffer where a single query is stored - - for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess) && logFile - && getline(logFile, line); ++lineQty) { - stringstream lineStr(line); - - oneQueryPostIds.clear(); - - { - uint32_t id; - while (lineStr >> id) { - if (uncompPosts.valid(id)) oneQueryPostIds.emplace_back(id); - } - } - if (oneQueryPostIds.empty() || - ((! bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1)) - ) { - skippedQty++; - continue; - } - if (!uncompPosts.loadPostings(oneQueryPostIds)) {// we couldn't load them all in - uncompPosts.clear(); - assert(uncompPosts.getMemUsed() == 0); - if (!uncompPosts.loadPostings(oneQueryPostIds)) { - cerr << "Cannot load postings for the query '" << line - << "' after all postings are deleted. 
Perhaps, the memory budget is too small." << endl; - cerr << "Aborting!" << endl; - return -1; - } - } - - for (uint32_t id : oneQueryPostIds) { - vector buffer = uncompPosts.getOnePost(id); - delta(0U, buffer.data(), buffer.size()); - er.eat(buffer.data(), buffer.size()); - } - + for (uint32_t id : oneQueryPostIds) { + vector buffer = uncompPosts.getOnePost(id); + delta(0U, buffer.data(), buffer.size()); + er.eat(buffer.data(), buffer.size()); } - logFile.close();// we are done - + } + logFile.close(); // we are done - - - cout << "# next line is shannon entropy and data bits" << endl; - cout << er.computeShannon() << "\t" << er.computeDataBits() << endl; + cout << "# next line is shannon entropy and data bits" << endl; + cout << er.computeShannon() << "\t" << er.computeDataBits() << endl; } - - diff --git a/advancedbenchmarking/src/ramtocache.cpp b/advancedbenchmarking/src/ramtocache.cpp index fad5e40..b79c5d7 100644 --- a/advancedbenchmarking/src/ramtocache.cpp +++ b/advancedbenchmarking/src/ramtocache.cpp @@ -13,7 +13,7 @@ #include "codecfactory.h" // https://code.google.com/p/likwid/wiki/LikwidPerfCtr#Using_the_marker_API -#ifdef LIKWID_MARKERS // see 'make likwidramtocache' for compiler flags +#ifdef LIKWID_MARKERS // see 'make likwidramtocache' for compiler flags #include #endif @@ -21,372 +21,395 @@ using namespace SIMDCompressionLib; void printusage(char *prog) { #ifdef LIKWID_MARKERS - cout << "example: likwid -m -C 1 -g BRANCH " << prog << " input.bin scheme1 scheme2 ..." << endl; - cout - << " You can use the -l flag to process only the first arrays" - << endl; + cout << "example: likwid -m -C 1 -g BRANCH " << prog + << " input.bin scheme1 scheme2 ..." << endl; + cout + << " You can use the -l flag to process only the first arrays" + << endl; #else - cout << " Try " << prog << " input.bin scheme1 scheme2 ..." 
<< endl; - cout - << " You can use the -l flag to process only the first arrays" - << endl; + cout << " Try " << prog << " input.bin scheme1 scheme2 ..." << endl; + cout + << " You can use the -l flag to process only the first arrays" + << endl; #endif } struct stats { - stats() : - VolumeDecoded(0), BytesDecoded(0), TimeDecoding(0), NumberOfArrays( - 0) { - } - uint64_t VolumeDecoded; // how many ints we decoded - uint64_t BytesDecoded; // how many bytes these ints used - uint64_t TimeDecoding; // how much time it took - uint64_t NumberOfArrays; // how many arrays were compressed + stats() + : VolumeDecoded(0), BytesDecoded(0), TimeDecoding(0), NumberOfArrays(0) {} + uint64_t VolumeDecoded; // how many ints we decoded + uint64_t BytesDecoded; // how many bytes these ints used + uint64_t TimeDecoding; // how much time it took + uint64_t NumberOfArrays; // how many arrays were compressed }; -void simplecompress(shared_ptr c, vector & buffer, - vector & obuffer) { - obuffer.resize(buffer.size() * 2); // allocate lots of memory - size_t lo = obuffer.size() ; - c->encodeArray(buffer.data() , buffer.size(), obuffer.data() , lo); - obuffer.resize(lo); +void simplecompress(shared_ptr c, vector &buffer, + vector &obuffer) { + obuffer.resize(buffer.size() * 2); // allocate lots of memory + size_t lo = obuffer.size(); + c->encodeArray(buffer.data(), buffer.size(), obuffer.data(), lo); + obuffer.resize(lo); } +void blockedcompress(shared_ptr c, vector &buffer, + vector &obuffer, uint32_t blocksize) { + if (blocksize == 0) { + throw runtime_error("null blocksize...bug?"); + } + obuffer.resize(buffer.size() * 2); // allocate lots of memory + size_t inpos = 0; + size_t outcounter = 0; + size_t outpos = (buffer.size() + blocksize - 1) / blocksize; -void blockedcompress(shared_ptr c, vector & buffer, - vector & obuffer, uint32_t blocksize) { - if (blocksize == 0) { - throw runtime_error("null blocksize...bug?"); - } - obuffer.resize(buffer.size() * 2); // allocate lots of memory - 
size_t inpos = 0; - size_t outcounter = 0; - size_t outpos = (buffer.size() + blocksize - 1) / blocksize; - - while (inpos < buffer.size()) { - size_t l = buffer.size() - inpos; - if (l > blocksize) - l = blocksize; - size_t lo = obuffer.size() - outpos ; - c->encodeArray(buffer.data() + inpos, l, obuffer.data() + outpos, lo); - obuffer[outcounter ++] = lo;// saving compressed length - outpos += lo; - inpos += l; - } - obuffer.resize(outpos); - obuffer.shrink_to_fit(); + while (inpos < buffer.size()) { + size_t l = buffer.size() - inpos; + if (l > blocksize) + l = blocksize; + size_t lo = obuffer.size() - outpos; + c->encodeArray(buffer.data() + inpos, l, obuffer.data() + outpos, lo); + obuffer[outcounter++] = lo; // saving compressed length + outpos += lo; + inpos += l; + } + obuffer.resize(outpos); + obuffer.shrink_to_fit(); } - // alternative to processArray for sanity checking -void simpleProcessArray( - map,stats> & mystats, - vector> & mbuffer, vector>& mobuffer, uint32_t & bogus) { - vector l1buf; - WallClockTimer z; - mobuffer.resize(mbuffer.size()); - for (auto p = mystats.begin(); p != mystats.end(); ++p) { - shared_ptr codec = p->first; - stats & s = p->second; - for(size_t K = 0; K < mbuffer.size(); ++K) { - vector & buffer = mbuffer[K]; - if( l1buf.size() < buffer.size() ) l1buf.resize(buffer.size()); - vector & obuffer = mobuffer[K]; - vector bufcopy(buffer); // we make a copy so that no funny business can happen - simplecompress(codec,bufcopy,obuffer); - s.VolumeDecoded += buffer.size() ; - s.BytesDecoded += obuffer.size() * sizeof(uint32_t) ; - } - // now we benchmark - // we first materialize everything in a cache-friendly manner - size_t howmany = mbuffer.size(); - vector datap(howmany); - vector datal(howmany); - for (size_t K = 0; K < howmany; ++K) { - datap[K] = mobuffer[K].data(); - datal[K] = mobuffer[K].size(); - } +void simpleProcessArray(map, stats> &mystats, + vector> &mbuffer, + vector> &mobuffer, uint32_t &bogus) { + vector l1buf; + 
WallClockTimer z; + mobuffer.resize(mbuffer.size()); + for (auto p = mystats.begin(); p != mystats.end(); ++p) { + shared_ptr codec = p->first; + stats &s = p->second; + for (size_t K = 0; K < mbuffer.size(); ++K) { + vector &buffer = mbuffer[K]; + if (l1buf.size() < buffer.size()) + l1buf.resize(buffer.size()); + vector &obuffer = mobuffer[K]; + vector bufcopy( + buffer); // we make a copy so that no funny business can happen + simplecompress(codec, bufcopy, obuffer); + s.VolumeDecoded += buffer.size(); + s.BytesDecoded += obuffer.size() * sizeof(uint32_t); + } + // now we benchmark + // we first materialize everything in a cache-friendly manner + size_t howmany = mbuffer.size(); + vector datap(howmany); + vector datal(howmany); + for (size_t K = 0; K < howmany; ++K) { + datap[K] = mobuffer[K].data(); + datal[K] = mobuffer[K].size(); + } - z.reset(); - for (size_t K = 0; K < howmany; ++K) { - s.NumberOfArrays++; - size_t l = l1buf.size(); - codec->decodeArray(datap[K], datal[K], - l1buf.data(), l); - bogus += l1buf[0]; - } - uint64_t timespent = z.split(); - s.TimeDecoding += timespent; - } + z.reset(); + for (size_t K = 0; K < howmany; ++K) { + s.NumberOfArrays++; + size_t l = l1buf.size(); + codec->decodeArray(datap[K], datal[K], l1buf.data(), l); + bogus += l1buf[0]; + } + uint64_t timespent = z.split(); + s.TimeDecoding += timespent; + } } -void processArray( - map>, stats> & mystats, - vector> & mbuffer, vector>& mobuffer, uint32_t & bogus) { - vector l1buf; - WallClockTimer z; - size_t inpos,outpos,outcounter; - mobuffer.resize(mbuffer.size()); - for (auto p = mystats.begin(); p != mystats.end(); ++p) { - shared_ptr codec = p->first.second; - stats & s = p->second; - uint32_t blocksize = p->first.first; - l1buf.resize(blocksize); - for(size_t K = 0; K < mbuffer.size(); ++K) { - vector & buffer = mbuffer[K]; - vector & obuffer = mobuffer[K]; - vector bufcopy(buffer); // we make a copy so that no funny business can happen - 
blockedcompress(codec,bufcopy,obuffer,blocksize); - s.VolumeDecoded += buffer.size(); - s.BytesDecoded += obuffer.size() * sizeof(uint32_t); - // we first do it without recording results to make sure no funny buffering issues - // are at play and to check the answer - inpos = 0; - for (outpos = (buffer.size() + blocksize - 1) / blocksize, outcounter = - 0; outpos < obuffer.size(); outpos += - obuffer[outcounter++]) { - size_t l = blocksize; - codec->decodeArray(obuffer.data() + outpos, obuffer[outcounter], - l1buf.data(), l); - // we check that we decoded the right values - size_t expectedl = buffer.size() - inpos; - if (expectedl > blocksize) - expectedl = blocksize; - if (l != expectedl) - throw runtime_error("failed to recover right length"); - for (size_t k = 0; k < l; ++k) - if (l1buf[k] != buffer[k + inpos]) { - for (size_t K = 0; K < l; ++K) - cout << K << ": expected is " << buffer[K + inpos] - << ", actual is " << l1buf[K] << endl; - throw runtime_error("bug decoded values do not match"); - } - inpos += l; - } - } - // now we benchmark for real knowing that the result will be sound - size_t howmany = mbuffer.size(); - vector datap(howmany); - vector counts(howmany); - vector datal(howmany); - for (size_t K = 0; K < howmany; ++K) { - counts[K] = mobuffer[K].data(); - datap[K] = mobuffer[K].data() + (mbuffer[K].size()+ blocksize - 1) / blocksize; - datal[K] = mobuffer[K].size() - (mbuffer[K].size()+ blocksize - 1) / blocksize; - } - z.reset(); - for (size_t K = 0; K < howmany; ++K) { - uint32_t * obuffer = datap[K]; - uint32_t * counters = counts[K]; - size_t thiscompressedlength = datal[K]; - s.NumberOfArrays++; - for (outcounter = 0; outcounter < thiscompressedlength; ) { - size_t l = blocksize; - codec->decodeArray(obuffer + outcounter, *counters, - l1buf.data(), l); - bogus += l1buf[0]; - outcounter += *counters; - counters++; - } - if(thiscompressedlength != outcounter) cerr<<"bug?"<>, stats> &mystats, + vector> &mbuffer, + vector> &mobuffer, uint32_t 
&bogus) { + vector l1buf; + WallClockTimer z; + size_t inpos, outpos, outcounter; + mobuffer.resize(mbuffer.size()); + for (auto p = mystats.begin(); p != mystats.end(); ++p) { + shared_ptr codec = p->first.second; + stats &s = p->second; + uint32_t blocksize = p->first.first; + l1buf.resize(blocksize); + for (size_t K = 0; K < mbuffer.size(); ++K) { + vector &buffer = mbuffer[K]; + vector &obuffer = mobuffer[K]; + vector bufcopy( + buffer); // we make a copy so that no funny business can happen + blockedcompress(codec, bufcopy, obuffer, blocksize); + s.VolumeDecoded += buffer.size(); + s.BytesDecoded += obuffer.size() * sizeof(uint32_t); + // we first do it without recording results to make sure no funny + // buffering issues + // are at play and to check the answer + inpos = 0; + for (outpos = (buffer.size() + blocksize - 1) / blocksize, outcounter = 0; + outpos < obuffer.size(); outpos += obuffer[outcounter++]) { + size_t l = blocksize; + codec->decodeArray(obuffer.data() + outpos, obuffer[outcounter], + l1buf.data(), l); + // we check that we decoded the right values + size_t expectedl = buffer.size() - inpos; + if (expectedl > blocksize) + expectedl = blocksize; + if (l != expectedl) + throw runtime_error("failed to recover right length"); + for (size_t k = 0; k < l; ++k) + if (l1buf[k] != buffer[k + inpos]) { + for (size_t K = 0; K < l; ++K) + cout << K << ": expected is " << buffer[K + inpos] + << ", actual is " << l1buf[K] << endl; + throw runtime_error("bug decoded values do not match"); + } + inpos += l; + } + } + // now we benchmark for real knowing that the result will be sound + size_t howmany = mbuffer.size(); + vector datap(howmany); + vector counts(howmany); + vector datal(howmany); + for (size_t K = 0; K < howmany; ++K) { + counts[K] = mobuffer[K].data(); + datap[K] = + mobuffer[K].data() + (mbuffer[K].size() + blocksize - 1) / blocksize; + datal[K] = + mobuffer[K].size() - (mbuffer[K].size() + blocksize - 1) / blocksize; + } + z.reset(); + for 
(size_t K = 0; K < howmany; ++K) { + uint32_t *obuffer = datap[K]; + uint32_t *counters = counts[K]; + size_t thiscompressedlength = datal[K]; + s.NumberOfArrays++; + for (outcounter = 0; outcounter < thiscompressedlength;) { + size_t l = blocksize; + codec->decodeArray(obuffer + outcounter, *counters, l1buf.data(), l); + bogus += l1buf[0]; + outcounter += *counters; + counters++; + } + if (thiscompressedlength != outcounter) + cerr << "bug?" << endl; + } + uint64_t timespent = z.split(); + s.TimeDecoding += timespent; + } } -void distributionOfDeltas(const uint32_t * in, const size_t length, - std::map & counter) { - uint32_t prev = 0; - for (size_t k = 0; k < length; ++k, ++in) { - uint32_t delta = in[0] - prev; - prev = in[0]; - std::map::iterator i = counter.find(delta); - if (i != counter.end()) - i->second += 1; - else - counter[delta] = 1; - } +void distributionOfDeltas(const uint32_t *in, const size_t length, + std::map &counter) { + uint32_t prev = 0; + for (size_t k = 0; k < length; ++k, ++in) { + uint32_t delta = in[0] - prev; + prev = in[0]; + std::map::iterator i = counter.find(delta); + if (i != counter.end()) + i->second += 1; + else + counter[delta] = 1; + } } -double entropy(std::map & counter) { - double length = 0; - for (std::map::iterator i = counter.begin(); - i != counter.end(); ++i) { - length += i->second; - } - if (length < 1) - return 0; - double total = 0; - for (std::map::iterator i = counter.begin(); - i != counter.end(); ++i) { - double x = i->second; - if (x < 1) - continue; - total += x / length * log(length / x) / log(2.0); - } - return total; +double entropy(std::map &counter) { + double length = 0; + for (std::map::iterator i = counter.begin(); + i != counter.end(); ++i) { + length += i->second; + } + if (length < 1) + return 0; + double total = 0; + for (std::map::iterator i = counter.begin(); + i != counter.end(); ++i) { + double x = i->second; + if (x < 1) + continue; + total += x / length * log(length / x) / log(2.0); + } + 
return total; } int main(int argc, char **argv) { - vector blocksizes = {32,64,128,256,512,1024,2048, 4096, 8192, 16384, 32768, 65536,1048576}; - vector > schemes; - int c; - size_t MAX_ARRAYS = std::numeric_limits::max(); - size_t VOLUME = 1024;// 1024 MB or 1 GB - while ((c = getopt(argc, argv, "r:l:")) != -1) { - switch (c) { - case 'l': - MAX_ARRAYS = atoi(optarg); - if (MAX_ARRAYS < 1) { - cerr << " MAX_ARRAYS param needs to be within [1,infty)." << endl; - printusage(argv[0]); - return -1; - } - break; - case 'v': - VOLUME = atoi(optarg); - if (VOLUME < 1) { - cerr << " VOLUME param needs to be within [1,infty). (In MB.)" << endl; - printusage(argv[0]); - return -1; - } - break; - case 'h': - printusage(argv[0]); - return 0; - default: - printusage(argv[0]); - return -1; - } - } - if (optind + 2 > argc) { - printusage(argv[0]); - return -1; - } + vector blocksizes = {32, 64, 128, 256, 512, 1024, 2048, + 4096, 8192, 16384, 32768, 65536, 1048576}; + vector> schemes; + int c; + size_t MAX_ARRAYS = std::numeric_limits::max(); + size_t VOLUME = 1024; // 1024 MB or 1 GB + while ((c = getopt(argc, argv, "r:l:")) != -1) { + switch (c) { + case 'l': + MAX_ARRAYS = atoi(optarg); + if (MAX_ARRAYS < 1) { + cerr << " MAX_ARRAYS param needs to be within [1,infty)." << endl; + printusage(argv[0]); + return -1; + } + break; + case 'v': + VOLUME = atoi(optarg); + if (VOLUME < 1) { + cerr << " VOLUME param needs to be within [1,infty). 
(In MB.)" << endl; + printusage(argv[0]); + return -1; + } + break; + case 'h': + printusage(argv[0]); + return 0; + default: + printusage(argv[0]); + return -1; + } + } + if (optind + 2 > argc) { + printusage(argv[0]); + return -1; + } #ifdef LIKWID_MARKERS - char currentMarker[64]; - likwid_markerInit(); + char currentMarker[64]; + likwid_markerInit(); #endif - uint32_t bogus = 0; - map>,stats> mystats; - map,stats> mysimplestats; + uint32_t bogus = 0; + map>, stats> mystats; + map, stats> mysimplestats; - try { - string ifilename = argv[optind]; - for(int k = optind + 1; k < argc; ++k) { - shared_ptr s = CODECFactory::getFromName(argv[k]); - if (s.get() == NULL) return -1; - schemes.push_back(s); - for(size_t z = 0; z s = CODECFactory::getFromName(argv[k]); + if (s.get() == NULL) + return -1; + schemes.push_back(s); + for (size_t z = 0; z < blocksizes.size(); ++z) { + stats S; + mystats[make_pair(blocksizes[z], s)] = S; + } + stats Ss; + mysimplestats[s] = Ss; + } - MaropuGapReader reader(ifilename); - if (!reader.open()) { - cout << " could not open " << ifilename << " for reading..." 
<< endl; - return -1; - } - cout << "# Compressing content from " << ifilename << " "; - cout.flush(); - vector> buffer; - vector> obuffer; - size_t numberofarrays = 0; - std::map counter; - vector b; - size_t howmanyints = 0; - size_t longestarray = 0; - size_t shortestarray = ~((size_t)0); - size_t buffermaxsize = 0; - size_t bufferminsize = ~((size_t)0); - size_t buffercurrentsize = 0; - while (reader.loadIntegers(b)) { - howmanyints += b.size(); - if(longestarray < b.size()) longestarray = b.size(); - if(shortestarray > b.size()) shortestarray = b.size(); - buffercurrentsize += b.size(); - distributionOfDeltas(b.data(), b.size(), counter); - buffer.push_back(b); - if(buffercurrentsize > VOLUME * 1024 * 1024 / 4) {// VOLUME is in MB - if(buffermaxsize < buffercurrentsize) buffermaxsize = buffercurrentsize; - if(bufferminsize > buffercurrentsize) bufferminsize = buffercurrentsize; - buffercurrentsize = 0; + MaropuGapReader reader(ifilename); + if (!reader.open()) { + cout << " could not open " << ifilename << " for reading..." 
<< endl; + return -1; + } + cout << "# Compressing content from " << ifilename << " "; + cout.flush(); + vector> buffer; + vector> obuffer; + size_t numberofarrays = 0; + std::map counter; + vector b; + size_t howmanyints = 0; + size_t longestarray = 0; + size_t shortestarray = ~((size_t)0); + size_t buffermaxsize = 0; + size_t bufferminsize = ~((size_t)0); + size_t buffercurrentsize = 0; + while (reader.loadIntegers(b)) { + howmanyints += b.size(); + if (longestarray < b.size()) + longestarray = b.size(); + if (shortestarray > b.size()) + shortestarray = b.size(); + buffercurrentsize += b.size(); + distributionOfDeltas(b.data(), b.size(), counter); + buffer.push_back(b); + if (buffercurrentsize > VOLUME * 1024 * 1024 / 4) { // VOLUME is in MB + if (buffermaxsize < buffercurrentsize) + buffermaxsize = buffercurrentsize; + if (bufferminsize > buffercurrentsize) + bufferminsize = buffercurrentsize; + buffercurrentsize = 0; #ifdef LIKWID_MARKERS - snprintf(currentMarker, sizeof(currentMarker), "all"); - likwid_markerStartRegion(currentMarker); + snprintf(currentMarker, sizeof(currentMarker), "all"); + likwid_markerStartRegion(currentMarker); #endif - processArray(mystats,buffer,obuffer,bogus); - simpleProcessArray(mysimplestats,buffer,obuffer,bogus); + processArray(mystats, buffer, obuffer, bogus); + simpleProcessArray(mysimplestats, buffer, obuffer, bogus); #ifdef LIKWID_MARKERS - likwid_markerStopRegion(currentMarker); + likwid_markerStopRegion(currentMarker); #endif - buffer.clear(); - } - if (numberofarrays % 1000 == 0) { - cout << "."; - cout.flush(); - } - ++numberofarrays; - if(numberofarrays == MAX_ARRAYS) break; - } - if(buffer.size() > 0) { - if(buffermaxsize < buffercurrentsize) buffermaxsize = buffercurrentsize; - if(bufferminsize > buffercurrentsize) bufferminsize = buffercurrentsize; - buffercurrentsize = 0; + buffer.clear(); + } + if (numberofarrays % 1000 == 0) { + cout << "."; + cout.flush(); + } + ++numberofarrays; + if (numberofarrays == MAX_ARRAYS) 
+ break; + } + if (buffer.size() > 0) { + if (buffermaxsize < buffercurrentsize) + buffermaxsize = buffercurrentsize; + if (bufferminsize > buffercurrentsize) + bufferminsize = buffercurrentsize; + buffercurrentsize = 0; #ifdef LIKWID_MARKERS - snprintf(currentMarker, sizeof(currentMarker), "all"); - likwid_markerStartRegion(currentMarker); + snprintf(currentMarker, sizeof(currentMarker), "all"); + likwid_markerStartRegion(currentMarker); #endif - processArray(mystats,buffer,obuffer,bogus); - simpleProcessArray(mysimplestats,buffer,obuffer,bogus); + processArray(mystats, buffer, obuffer, bogus); + simpleProcessArray(mysimplestats, buffer, obuffer, bogus); #ifdef LIKWID_MARKERS - likwid_markerStopRegion(currentMarker); + likwid_markerStopRegion(currentMarker); #endif - buffer.clear(); - } - cout << endl; - cout << "# Entropy of deltas " << entropy(counter) << endl; - cout << "# Processed " << numberofarrays << " arrays " << endl; - cout << "# Processed " << howmanyints << " integers " << endl; - cout << "# Average array length is " << howmanyints / numberofarrays << " integers " << endl; - cout << "# Smallest array has " << shortestarray << " integers " << endl; - cout << "# Longest array has " << longestarray << " integers " << endl; - cout << "# Smallest buffer size " << bufferminsize*4.0/(1024.0*1024) << " MB " << endl; - cout << "# Largest buffer size " << buffermaxsize*4.0/(1024.0*1024) << " MB " << endl; - cout<<"# in-memory volume tops at "<first.first; - shared_ptr codec = p->first.second; - stats & s = p->second; - if(s.NumberOfArrays != numberofarrays) throw runtime_error("bug"); - string codecname = CODECFactory::getName(*codec); - cout<(s.VolumeDecoded)/static_cast(s.TimeDecoding)) - <(s.BytesDecoded)*8.0/static_cast(s.VolumeDecoded)< codec = p->first; - stats & s = p->second; - string codecname = CODECFactory::getName(*codec); - cout<(s.VolumeDecoded)/static_cast(s.TimeDecoding)) - <(s.BytesDecoded)*8.0/static_cast(s.VolumeDecoded)<first.first; + 
shared_ptr codec = p->first.second; + stats &s = p->second; + if (s.NumberOfArrays != numberofarrays) + throw runtime_error("bug"); + string codecname = CODECFactory::getName(*codec); + cout << left << setw(10) << blocksize << setw(30) << left << codecname + << setw(20) << left << round(static_cast(s.VolumeDecoded) / + static_cast(s.TimeDecoding)) + << setw(20) << left + << static_cast(s.BytesDecoded) * 8.0 / + static_cast(s.VolumeDecoded) + << endl; + } + for (auto p = mysimplestats.begin(); p != mysimplestats.end(); ++p) { + shared_ptr codec = p->first; + stats &s = p->second; + string codecname = CODECFactory::getName(*codec); + cout << left << setw(10) << "-1" << setw(30) << left << codecname + << setw(20) << left << round(static_cast(s.VolumeDecoded) / + static_cast(s.TimeDecoding)) + << setw(20) << left + << static_cast(s.BytesDecoded) * 8.0 / + static_cast(s.VolumeDecoded) + << endl; + } + } catch (const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } #ifdef LIKWID_MARKERS - likwid_markerClose(); + likwid_markerClose(); #endif - return 0; + return 0; } - diff --git a/advancedbenchmarking/src/readflat.cpp b/advancedbenchmarking/src/readflat.cpp index 45b4733..7df0417 100644 --- a/advancedbenchmarking/src/readflat.cpp +++ b/advancedbenchmarking/src/readflat.cpp @@ -8,52 +8,48 @@ #include "maropuparser.h" #include "codecfactory.h" - using namespace SIMDCompressionLib; void printusage(char *prog) { - cout << "Usage: " << prog << " " << endl; - + cout << "Usage: " << prog << " " + << endl; } int main(int argc, char **argv) { - if (argc != 2) { - printusage(argv[1]); - return -1; + if (argc != 2) { + printusage(argv[1]); + return -1; + } + + try { + string postFileName = argv[1]; + + MaropuGapReader reader(postFileName); + if (!reader.open()) { + cout << " could not open " << postFileName << " for reading..." 
<< endl; + return -1; } + off_t pos; + uint32_t qty; + uint32_t postId = 0; - try { - string postFileName = argv[1]; - - MaropuGapReader reader(postFileName); - if (!reader.open()) { - cout << " could not open " << postFileName << " for reading..." << endl; - return -1; - } - - off_t pos; - uint32_t qty; - uint32_t postId = 0; - - while (reader.readNextPosAndQty(pos, qty)) { - cout << "id: " << postId << " qty: " << qty << "offset: " << pos << endl; - postId++; - if ((reader.getPos() - pos) != (qty + 1) * 4) { - cerr << "Internal error: unpexected diff in offsets!" << endl; - return -1; - } - } - - } catch (const exception &e) { - cerr << "Run-time error: " << e.what() << endl; - return -1; - } catch (...) { - cerr << "Unknown exception caught, terminating..." << endl; + while (reader.readNextPosAndQty(pos, qty)) { + cout << "id: " << postId << " qty: " << qty << "offset: " << pos << endl; + postId++; + if ((reader.getPos() - pos) != (qty + 1) * 4) { + cerr << "Internal error: unpexected diff in offsets!" << endl; return -1; + } } - return 0; -} - + } catch (const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } + return 0; +} diff --git a/advancedbenchmarking/src/simplesynth.cpp b/advancedbenchmarking/src/simplesynth.cpp index ecde0f7..5ef740d 100644 --- a/advancedbenchmarking/src/simplesynth.cpp +++ b/advancedbenchmarking/src/simplesynth.cpp @@ -15,102 +15,102 @@ using namespace SIMDCompressionLib; void printusage() { - cout << "This generate a file containing arrays. Each array is " - "written as a sequence of 32-bit unsigned integers (sorted) " - "preceded by a length indicated as a 32-bit unsigned integer. 
" - << endl; - cout << "Flags: " << endl; - cout << "-u : uniform (as opposed to clustered) " << endl; - cout << "-N : length of arrays " << endl; - cout << "-m : how many arrays " << endl; - cout << "-R : range in bits (e.g., 31 means 31-bit integers) " << endl; - cout << " Try ./simplesynth -m 4214 -N 10000 test.bin" << endl; - + cout << "This generate a file containing arrays. Each array is " + "written as a sequence of 32-bit unsigned integers (sorted) " + "preceded by a length indicated as a 32-bit unsigned integer. " + << endl; + cout << "Flags: " << endl; + cout << "-u : uniform (as opposed to clustered) " << endl; + cout << "-N : length of arrays " << endl; + cout << "-m : how many arrays " << endl; + cout << "-R : range in bits (e.g., 31 means 31-bit integers) " << endl; + cout << " Try ./simplesynth -m 4214 -N 10000 test.bin" << endl; } int main(int argc, char **argv) { - size_t howmany = 1000; // default on 1000 arrays - uint32_t N = 1024; // default on arrays having 1024 ints - size_t range = 30; // default on arrays spanning 30 bits - bool cluster = true; // default on clustered data - ClusteredDataGenerator cdg; - UniformDataGenerator udg; + size_t howmany = 1000; // default on 1000 arrays + uint32_t N = 1024; // default on arrays having 1024 ints + size_t range = 30; // default on arrays spanning 30 bits + bool cluster = true; // default on clustered data + ClusteredDataGenerator cdg; + UniformDataGenerator udg; - int c; - while ((c = getopt(argc, argv, "huN:m:R:")) != -1) - switch (c) { - case 'u': - cluster = false; - break; - case 'N': - N = atoi(optarg); - if (N < 1) { - printusage(); - return -1; - } - break; - case 'h': - printusage(); - return 0; - case 'm': - howmany = atoi(optarg); - if (howmany < 1) { - printusage(); - return -1; - } - break; - case 'R': - range = atoi(optarg); - if ((range < 1) or (range >= 32)) { - printusage(); - return -1; - } - break; - default: - abort(); - } - if (optind >= argc) { + int c; + while ((c = getopt(argc, 
argv, "huN:m:R:")) != -1) + switch (c) { + case 'u': + cluster = false; + break; + case 'N': + N = atoi(optarg); + if (N < 1) { printusage(); return -1; - } - uint32_t Max = 1U << range; - if (N > Max) { + } + break; + case 'h': + printusage(); + return 0; + case 'm': + howmany = atoi(optarg); + if (howmany < 1) { printusage(); return -1; - } - string ofilename = argv[optind]; - FILE *fd = ::fopen(ofilename.c_str(), "wb"); - if (fd == NULL) { - cout << " could not open " << ofilename << " for writing..." << endl; + } + break; + case 'R': + range = atoi(optarg); + if ((range < 1) or (range >= 32)) { + printusage(); return -1; + } + break; + default: + abort(); + } + if (optind >= argc) { + printusage(); + return -1; + } + uint32_t Max = 1U << range; + if (N > Max) { + printusage(); + return -1; + } + string ofilename = argv[optind]; + FILE *fd = ::fopen(ofilename.c_str(), "wb"); + if (fd == NULL) { + cout << " could not open " << ofilename << " for writing..." << endl; + return -1; + } + cout << " generating " << (cluster ? "cluster" : "uniform") << " " << howmany + << " arrays of length = " << N << " over " << range << " bits to file " + << ofilename << endl; + WallClockTimer z; + for (size_t i = 0; i < howmany; ++i) { + vector array = + cluster ? cdg.generate(N, Max) : udg.generate(N, Max); + assert(array.size() == N); + if (fwrite(&N, sizeof(N), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; } - cout << " generating " << (cluster ? "cluster" : "uniform") << - " " << howmany << " arrays of length = " << N << - " over " << range << " bits to file " << ofilename << endl; - WallClockTimer z; - for (size_t i = 0; i < howmany; ++i) { - vector array = cluster ? 
cdg.generate(N, Max) : udg.generate(N, Max); - assert(array.size() == N); - if (fwrite(&N, sizeof(N), 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - return -1; - } - if (fwrite(array.data(), sizeof(uint32_t) * N, 1, fd) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - return -1; - } - if (i % 1000 == 0) { - cout << "."; - cout.flush(); - } + if (fwrite(array.data(), sizeof(uint32_t) * N, 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; } - cout << endl; - long ti = z.split(); - cout << " Written " << howmany << " arrays of " << N << " integers" << endl; - if (ti > 0) { - double speed = static_cast(howmany * N) / static_cast(ti); - cout << " Speed: " << speed << " mis " << endl; + if (i % 1000 == 0) { + cout << "."; + cout.flush(); } + } + cout << endl; + long ti = z.split(); + cout << " Written " << howmany << " arrays of " << N << " integers" << endl; + if (ti > 0) { + double speed = static_cast(howmany * N) / static_cast(ti); + cout << " Speed: " << speed << " mis " << endl; + } } diff --git a/advancedbenchmarking/src/uncompress.cpp b/advancedbenchmarking/src/uncompress.cpp index d8c034a..ef283ed 100644 --- a/advancedbenchmarking/src/uncompress.cpp +++ b/advancedbenchmarking/src/uncompress.cpp @@ -15,135 +15,131 @@ using namespace SIMDCompressionLib; -void printusage() { - cout << " Try ./uncompress input.bin output.bin" << endl; - -} +void printusage() { cout << " Try ./uncompress input.bin output.bin" << endl; } int main(int argc, char **argv) { - string scheme; - int c; - while ((c = getopt(argc, argv, "h")) != -1) - switch (c) { - case 'h': - printusage(); - return 0; - default: - abort(); - } - if (optind + 1 >= argc) { - printusage(); - return -1; + string scheme; + int c; + while ((c = getopt(argc, argv, "h")) != -1) + switch (c) { + case 'h': + printusage(); + return 0; + default: + abort(); } - try { - string ifilename = argv[optind]; - string ofilename = argv[optind + 1]; - FILE *fd = 
::fopen(ifilename.c_str(), "rb"); - if (fd == NULL) { - cerr << " can't open " << ifilename << endl; - return -1; - } - FILE *fdout = ::fopen(ofilename.c_str(), "wb"); - if (fdout == NULL) { - cerr << " can't open " << ofilename << endl; - ::fclose(fd); - return -1; - } - cout << "uncompressing " << ifilename << " to " << ofilename << endl; - uint32_t version; - size_t result = fread(&version, sizeof(version), 1, fd); - cout << "compressed file format version = " << version << endl; - if (result != 1) { - cerr << "wrong version" << endl; - ::fclose(fd); - ::fclose(fdout); - return -1; - } - uint32_t schemesize; - result = fread(&schemesize, sizeof(schemesize), 1, fd); - if (result != 1) { - ::fclose(fd); - ::fclose(fdout); - return -1; - } - vector b(schemesize + 1); - result = fread(b.data(), schemesize * sizeof(char), 1, fd); - if (result != 1) { - ::fclose(fd); - ::fclose(fdout); - return -1; - } - string schemename(b.data()); - cout << " data was compressed using " << schemename << endl; - shared_ptr schemeptr = CODECFactory::getFromName(schemename); - if (schemeptr.get() == NULL) { - return -1; - } - vector buffer, obuffer; - size_t i = 0; - size_t volume = 0; - WallClockTimer z; - while (true) { - uint32_t csize; - result = fread(&csize, sizeof(uint32_t), 1, fd); - if (result != 1) { - break; - } - uint32_t osize; - result = fread(&osize, sizeof(uint32_t), 1, fd); - if (result != 1) { - break; - } - obuffer.resize(osize); - buffer.resize(csize); - result = fread(buffer.data(), sizeof(uint32_t) * csize, 1, fd); - if (result != 1) { - break; - } - size_t sosize(osize); - schemeptr->decodeArray(buffer.data(), buffer.size(), obuffer.data(), sosize); - assert(sosize == osize); - if (fwrite(&osize, sizeof(osize), 1, fdout) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - ::fclose(fdout); - return -1; - } - if (fwrite(obuffer.data(), sizeof(uint32_t) * osize, 1, fdout) != 1) { - cerr << "aborting" << endl; - ::fclose(fd); - ::fclose(fdout); - return -1; - } - 
- if (i % 1000 == 0) { - cout << "."; - cout.flush(); - } - volume += osize; - ++i; - - } - cout << endl; - long ti = z.split(); - - cout << " Uncompressed " << i << " arrays and " << volume << " integers " << endl; + if (optind + 1 >= argc) { + printusage(); + return -1; + } + try { + string ifilename = argv[optind]; + string ofilename = argv[optind + 1]; + FILE *fd = ::fopen(ifilename.c_str(), "rb"); + if (fd == NULL) { + cerr << " can't open " << ifilename << endl; + return -1; + } + FILE *fdout = ::fopen(ofilename.c_str(), "wb"); + if (fdout == NULL) { + cerr << " can't open " << ofilename << endl; + ::fclose(fd); + return -1; + } + cout << "uncompressing " << ifilename << " to " << ofilename << endl; + uint32_t version; + size_t result = fread(&version, sizeof(version), 1, fd); + cout << "compressed file format version = " << version << endl; + if (result != 1) { + cerr << "wrong version" << endl; + ::fclose(fd); + ::fclose(fdout); + return -1; + } + uint32_t schemesize; + result = fread(&schemesize, sizeof(schemesize), 1, fd); + if (result != 1) { + ::fclose(fd); + ::fclose(fdout); + return -1; + } + vector b(schemesize + 1); + result = fread(b.data(), schemesize * sizeof(char), 1, fd); + if (result != 1) { + ::fclose(fd); + ::fclose(fdout); + return -1; + } + string schemename(b.data()); + cout << " data was compressed using " << schemename << endl; + shared_ptr schemeptr = CODECFactory::getFromName(schemename); + if (schemeptr.get() == NULL) { + return -1; + } + vector buffer, obuffer; + size_t i = 0; + size_t volume = 0; + WallClockTimer z; + while (true) { + uint32_t csize; + result = fread(&csize, sizeof(uint32_t), 1, fd); + if (result != 1) { + break; + } + uint32_t osize; + result = fread(&osize, sizeof(uint32_t), 1, fd); + if (result != 1) { + break; + } + obuffer.resize(osize); + buffer.resize(csize); + result = fread(buffer.data(), sizeof(uint32_t) * csize, 1, fd); + if (result != 1) { + break; + } + size_t sosize(osize); + 
schemeptr->decodeArray(buffer.data(), buffer.size(), obuffer.data(), + sosize); + assert(sosize == osize); + if (fwrite(&osize, sizeof(osize), 1, fdout) != 1) { + cerr << "aborting" << endl; ::fclose(fd); ::fclose(fdout); - if (ti > 0) { - double speed = static_cast(volume) / static_cast(ti); - cout << " Speed: " << speed << " mis " << endl; - } - - } catch (const exception &e) { - cerr << "Run-time error: " << e.what() << endl; return -1; - } catch (...) { - cerr << "Unknown exception caught, terminating..." << endl; + } + if (fwrite(obuffer.data(), sizeof(uint32_t) * osize, 1, fdout) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + ::fclose(fdout); return -1; + } + + if (i % 1000 == 0) { + cout << "."; + cout.flush(); + } + volume += osize; + ++i; } + cout << endl; + long ti = z.split(); - return 0; -} + cout << " Uncompressed " << i << " arrays and " << volume << " integers " + << endl; + ::fclose(fd); + ::fclose(fdout); + if (ti > 0) { + double speed = static_cast(volume) / static_cast(ti); + cout << " Speed: " << speed << " mis " << endl; + } + } catch (const exception &e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } + return 0; +} diff --git a/example.cpp b/example.cpp index 6f45c7a..2f6cf5f 100644 --- a/example.cpp +++ b/example.cpp @@ -15,69 +15,76 @@ using namespace SIMDCompressionLib; int main() { - // We pick a CODEC - IntegerCODEC &codec = * CODECFactory::getFromName("s4-bp128-dm"); - // could use others, e.g., "varint", "s-fastpfor-1" - //////////// - // - // create a container with some integers in it - // - // We need the integers to be in sorted order. - // - // (Note: You don't need to use a vector.) - // - size_t N = 10 * 1000; - vector mydata(N); - for (uint32_t i = 0; i < N; ++i) mydata[i] = 3 * i; - /////////// - // - // You need some "output" container. You are responsible - // for allocating enough memory. 
- // - vector compressed_output(N + 1024); - // N+1024 should be plenty - // - // - size_t compressedsize = compressed_output.size(); - codec.encodeArray(mydata.data(), mydata.size(), - compressed_output.data(), compressedsize); - // - // if desired, shrink back the array: - compressed_output.resize(compressedsize); - compressed_output.shrink_to_fit(); - // display compression rate: - cout << setprecision(3); - cout << "You are using " << 32.0 * static_cast(compressed_output.size()) / - static_cast(mydata.size()) << " bits per integer. " << endl; - // - // You are done!... with the compression... - // - /// - // decompressing is also easy: - // - vector mydataback(N); - size_t recoveredsize = mydataback.size(); - // - codec.decodeArray(compressed_output.data(), - compressed_output.size(), mydataback.data(), recoveredsize); - mydataback.resize(recoveredsize); - // - // That's it for compression! - // - if (mydataback != mydata) throw runtime_error("bug!"); + // We pick a CODEC + IntegerCODEC &codec = *CODECFactory::getFromName("s4-bp128-dm"); + // could use others, e.g., "varint", "s-fastpfor-1" + //////////// + // + // create a container with some integers in it + // + // We need the integers to be in sorted order. + // + // (Note: You don't need to use a vector.) + // + size_t N = 10 * 1000; + vector mydata(N); + for (uint32_t i = 0; i < N; ++i) + mydata[i] = 3 * i; + /////////// + // + // You need some "output" container. You are responsible + // for allocating enough memory. 
+ // + vector compressed_output(N + 1024); + // N+1024 should be plenty + // + // + size_t compressedsize = compressed_output.size(); + codec.encodeArray(mydata.data(), mydata.size(), compressed_output.data(), + compressedsize); + // + // if desired, shrink back the array: + compressed_output.resize(compressedsize); + compressed_output.shrink_to_fit(); + // display compression rate: + cout << setprecision(3); + cout << "You are using " + << 32.0 * static_cast(compressed_output.size()) / + static_cast(mydata.size()) + << " bits per integer. " << endl; + // + // You are done!... with the compression... + // + /// + // decompressing is also easy: + // + vector mydataback(N); + size_t recoveredsize = mydataback.size(); + // + codec.decodeArray(compressed_output.data(), compressed_output.size(), + mydataback.data(), recoveredsize); + mydataback.resize(recoveredsize); + // + // That's it for compression! + // + if (mydataback != mydata) + throw runtime_error("bug!"); - // - // Next we are going to test out intersection... - // - vector mydata2(N); - for (uint32_t i = 0; i < N; ++i) mydata2[i] = 6 * i; - intersectionfunction inter = IntersectionFactory::getFromName("simd");// using SIMD intersection - // - // we are going to intersect mydata and mydata2 and write back - // the result to mydata2 - // - size_t intersize = inter(mydata2.data(), mydata2.size(), mydata.data(), mydata.size(), mydata2.data()); - mydata2.resize(intersize); - mydata2.shrink_to_fit(); - cout << "Intersection size: " << mydata2.size() << " integers. " << endl; + // + // Next we are going to test out intersection... 
+ // + vector mydata2(N); + for (uint32_t i = 0; i < N; ++i) + mydata2[i] = 6 * i; + intersectionfunction inter = + IntersectionFactory::getFromName("simd"); // using SIMD intersection + // + // we are going to intersect mydata and mydata2 and write back + // the result to mydata2 + // + size_t intersize = inter(mydata2.data(), mydata2.size(), mydata.data(), + mydata.size(), mydata2.data()); + mydata2.resize(intersize); + mydata2.shrink_to_fit(); + cout << "Intersection size: " << mydata2.size() << " integers. " << endl; } diff --git a/include/VarIntG8IU.h b/include/VarIntG8IU.h index 409b0ed..b187f9a 100644 --- a/include/VarIntG8IU.h +++ b/include/VarIntG8IU.h @@ -3,7 +3,8 @@ * Apache License Version 2.0 http://www.apache.org/licenses/. */ #ifndef __SSSE3__ -#pragma message "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler" +#pragma message \ + "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler" #else #ifndef VARINTG8IU_H__ #define VARINTG8IU_H__ @@ -22,7 +23,8 @@ namespace SIMDCompressionLib { * Implementation of varint-G8IU taken from * Stepanov et al., SIMD-Based Decoding of Posting Lists, CIKM 2011 * - * Update: D. Lemire believes that this scheme was patented by Rose, Stepanov et al. (patent 20120221539). + * Update: D. Lemire believes that this scheme was patented by Rose, Stepanov et + * al. (patent 20120221539). * We wrote this code before the patent was published (August 2012). * * By Maxime Caron and Daniel Lemire @@ -34,204 +36,202 @@ namespace SIMDCompressionLib { * */ - -template -class VarIntG8IU: public IntegerCODEC { +template class VarIntG8IU : public IntegerCODEC { public: - - // For all possible values of the - // descriptor we build a table of any shuffle sequence - // that might be needed at decode time. 
- VarIntG8IU() { - char mask[256][32]; - for (int desc = 0; desc <= 255; desc++) { - memset(mask[desc],-1,32); - int bitmask = 0x00000001; - int bitindex = 0; - // count number of 0 in the char - int complete = 0; - int ithSize[8]; - int lastpos = -1; - while (bitindex < 8) { - if ((desc & bitmask) == 0) { - ithSize[complete] = bitindex - lastpos; - lastpos = bitindex; - complete++; - } - bitindex++; - bitmask = bitmask << 1; - } - maskOutputSize[desc] = complete; - - int j = 0; - int k = 0; - for (int i = 0; i < complete; i++) { - for (int n = 0; n < 4; n++) { - if (n < ithSize[i]) { - mask[desc][k] = static_cast(j); - j = j + 1; - } else { - mask[desc][k] = -1; - } - k = k + 1; - } - } - - } - for (int desc = 0; desc <= 255; desc++) { - vecmask[desc][0] = _mm_lddqu_si128 (reinterpret_cast<__m128i const*> (mask[desc])); - vecmask[desc][1] = _mm_lddqu_si128 (reinterpret_cast<__m128i const*> (mask[desc] + 16)); - } - - } - - void encodeArray(uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - uint32_t prev = 0; // for deltas - const uint32_t * src = in; - size_t srclength = length * 4;// number of input bytes - - unsigned char* dst = reinterpret_cast (out); - nvalue = nvalue * 4;// output bytes - - size_t compressed_size = 0; - while (srclength > 0 && nvalue >= 9) { - compressed_size += encodeBlock(src, srclength, dst, nvalue, prev); - } - //Ouput might not be a multiple of 4 so we make it so - nvalue = ((compressed_size + 3 )/ 4); - } - const uint32_t * decodeArray(const uint32_t *in, - const size_t length, uint32_t *out, size_t &nvalue) { - __m128i mprev = _mm_setzero_si128();// for deltas - const unsigned char* src = reinterpret_cast (in); - const uint32_t * const initdst = out; - - uint32_t * dst = out; - size_t srclength = length * 4; - for (; srclength >= 22; srclength -= 8, src += 8) { - unsigned char desc = *src; - src += 1; - srclength -= 1; - const __m128i data = _mm_lddqu_si128 (reinterpret_cast<__m128i const* > (src)); - const __m128i 
result = _mm_shuffle_epi8 (data,vecmask[desc][0]); - if(delta) { - mprev = RegularDeltaSIMD::PrefixSum(result,mprev); - _mm_storeu_si128(reinterpret_cast<__m128i*> (dst), mprev); - } else { - _mm_storeu_si128(reinterpret_cast<__m128i*> (dst), result); - } - int readSize = maskOutputSize[desc]; - if ( readSize > 4 ) { - const __m128i result2 = _mm_shuffle_epi8 (data, vecmask[desc][1]); - if(delta) { - mprev = RegularDeltaSIMD::PrefixSum(result2,mprev); - _mm_storeu_si128(reinterpret_cast<__m128i*> (dst + 4), mprev); - } else { - _mm_storeu_si128(reinterpret_cast<__m128i *> (dst + 4), result2); - } - } - dst += readSize; - } - while(srclength >= 9) { - unsigned char desc = *src; - src += 1; - srclength -= 1; - char buff[32]; - memcpy(buff, src, 8); - const __m128i data = _mm_lddqu_si128 (reinterpret_cast<__m128i const*> (buff)); - const __m128i result = _mm_shuffle_epi8 (data,vecmask[desc][0]); - if(delta) { - mprev = RegularDeltaSIMD::PrefixSum(result,mprev); - _mm_storeu_si128(reinterpret_cast<__m128i*> (buff), mprev); - } else { - _mm_storeu_si128(reinterpret_cast<__m128i*> (buff), result); - } - int readSize = maskOutputSize[desc]; - if ( readSize > 4 ) { - const __m128i result2 = _mm_shuffle_epi8 (data, vecmask[desc][1]); - if(delta) { - mprev = RegularDeltaSIMD::PrefixSum(result2,mprev); - _mm_storeu_si128(reinterpret_cast<__m128i*> (buff+16), mprev); - } else { - _mm_storeu_si128(reinterpret_cast<__m128i*> (buff+16), result2); - } - } - memcpy(dst, buff, 4 * readSize); - dst += readSize; - srclength -= 8; - src += 8; - } - - nvalue = (dst - initdst); - return reinterpret_cast ((reinterpret_cast (src) - + 3) & ~3); - } - - virtual std::string name() const { - return std::string("VarIntG8IU") + ((delta==1)?"scalardelta":((delta==2)?"delta":"")); - } - - int encodeBlock(const uint32_t*& src, size_t& srclength, - unsigned char*& dest, size_t& dstlength, uint32_t & prev) { - unsigned char desc = 0xFF; - unsigned char bitmask = 0x01; - uint32_t buffer[8]; - int 
ithSize[8]; - int length = 0; - int numInt = 0; - - while (srclength > 0) { - const uint32_t* temp = src; - - int byteNeeded = delta ? getNumByteNeeded(*temp - prev) : getNumByteNeeded(*temp); - - if (PREDICT_FALSE(length + byteNeeded > 8)) { - break; - } - - //flip the correct bit in desc - bitmask = static_cast(bitmask << (byteNeeded - 1)); - desc = desc ^ bitmask; - bitmask = static_cast(bitmask << 1); - - ithSize[numInt] = byteNeeded; - length += byteNeeded; - buffer[numInt] = delta ? *temp - prev : *temp; - if(delta ) - prev = *temp; - src = src + 1; - srclength -= 4; - numInt++; - } - - dest[0] = desc; - int written = 1; - for (int i = 0; i < numInt; i++) { - int size = ithSize[i]; - uint32_t value = buffer[i]; - for (int j = 0; j < size; j++) { - dest[written] = static_cast(value >> (j * 8)); - written++; - } - } - dest += 9; - dstlength -= 9; - return 9; - } + // For all possible values of the + // descriptor we build a table of any shuffle sequence + // that might be needed at decode time. 
+ VarIntG8IU() { + char mask[256][32]; + for (int desc = 0; desc <= 255; desc++) { + memset(mask[desc], -1, 32); + int bitmask = 0x00000001; + int bitindex = 0; + // count number of 0 in the char + int complete = 0; + int ithSize[8]; + int lastpos = -1; + while (bitindex < 8) { + if ((desc & bitmask) == 0) { + ithSize[complete] = bitindex - lastpos; + lastpos = bitindex; + complete++; + } + bitindex++; + bitmask = bitmask << 1; + } + maskOutputSize[desc] = complete; + + int j = 0; + int k = 0; + for (int i = 0; i < complete; i++) { + for (int n = 0; n < 4; n++) { + if (n < ithSize[i]) { + mask[desc][k] = static_cast(j); + j = j + 1; + } else { + mask[desc][k] = -1; + } + k = k + 1; + } + } + } + for (int desc = 0; desc <= 255; desc++) { + vecmask[desc][0] = + _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc])); + vecmask[desc][1] = + _mm_lddqu_si128(reinterpret_cast<__m128i const *>(mask[desc] + 16)); + } + } + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint32_t prev = 0; // for deltas + const uint32_t *src = in; + size_t srclength = length * 4; // number of input bytes + + unsigned char *dst = reinterpret_cast(out); + nvalue = nvalue * 4; // output bytes + + size_t compressed_size = 0; + while (srclength > 0 && nvalue >= 9) { + compressed_size += encodeBlock(src, srclength, dst, nvalue, prev); + } + // Ouput might not be a multiple of 4 so we make it so + nvalue = ((compressed_size + 3) / 4); + } + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + __m128i mprev = _mm_setzero_si128(); // for deltas + const unsigned char *src = reinterpret_cast(in); + const uint32_t *const initdst = out; + + uint32_t *dst = out; + size_t srclength = length * 4; + for (; srclength >= 22; srclength -= 8, src += 8) { + unsigned char desc = *src; + src += 1; + srclength -= 1; + const __m128i data = + _mm_lddqu_si128(reinterpret_cast<__m128i const *>(src)); + const __m128i 
result = _mm_shuffle_epi8(data, vecmask[desc][0]); + if (delta) { + mprev = RegularDeltaSIMD::PrefixSum(result, mprev); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), mprev); + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), result); + } + int readSize = maskOutputSize[desc]; + if (readSize > 4) { + const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]); + if (delta) { + mprev = RegularDeltaSIMD::PrefixSum(result2, mprev); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 4), mprev); + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 4), result2); + } + } + dst += readSize; + } + while (srclength >= 9) { + unsigned char desc = *src; + src += 1; + srclength -= 1; + char buff[32]; + memcpy(buff, src, 8); + const __m128i data = + _mm_lddqu_si128(reinterpret_cast<__m128i const *>(buff)); + const __m128i result = _mm_shuffle_epi8(data, vecmask[desc][0]); + if (delta) { + mprev = RegularDeltaSIMD::PrefixSum(result, mprev); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buff), mprev); + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(buff), result); + } + int readSize = maskOutputSize[desc]; + if (readSize > 4) { + const __m128i result2 = _mm_shuffle_epi8(data, vecmask[desc][1]); + if (delta) { + mprev = RegularDeltaSIMD::PrefixSum(result2, mprev); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buff + 16), mprev); + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(buff + 16), result2); + } + } + memcpy(dst, buff, 4 * readSize); + dst += readSize; + srclength -= 8; + src += 8; + } + + nvalue = (dst - initdst); + return reinterpret_cast((reinterpret_cast(src) + 3) & + ~3); + } + + virtual std::string name() const { + return std::string("VarIntG8IU") + + ((delta == 1) ? "scalardelta" : ((delta == 2) ? 
"delta" : "")); + } + + int encodeBlock(const uint32_t *&src, size_t &srclength, unsigned char *&dest, + size_t &dstlength, uint32_t &prev) { + unsigned char desc = 0xFF; + unsigned char bitmask = 0x01; + uint32_t buffer[8]; + int ithSize[8]; + int length = 0; + int numInt = 0; + + while (srclength > 0) { + const uint32_t *temp = src; + + int byteNeeded = + delta ? getNumByteNeeded(*temp - prev) : getNumByteNeeded(*temp); + + if (PREDICT_FALSE(length + byteNeeded > 8)) { + break; + } + + // flip the correct bit in desc + bitmask = static_cast(bitmask << (byteNeeded - 1)); + desc = desc ^ bitmask; + bitmask = static_cast(bitmask << 1); + + ithSize[numInt] = byteNeeded; + length += byteNeeded; + buffer[numInt] = delta ? *temp - prev : *temp; + if (delta) + prev = *temp; + src = src + 1; + srclength -= 4; + numInt++; + } + + dest[0] = desc; + int written = 1; + for (int i = 0; i < numInt; i++) { + int size = ithSize[i]; + uint32_t value = buffer[i]; + for (int j = 0; j < size; j++) { + dest[written] = static_cast(value >> (j * 8)); + written++; + } + } + dest += 9; + dstlength -= 9; + return 9; + } private: + int maskOutputSize[256]; + __m128i vecmask[256][2]; - int maskOutputSize[256]; - __m128i vecmask[256][2]; - - int getNumByteNeeded(const uint32_t val) { - return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1; - } - + int getNumByteNeeded(const uint32_t val) { + return ((__builtin_clz(val | 255) ^ 31) >> 3) + 1; + } }; } -#endif // VARINTG8IU_H__ +#endif // VARINTG8IU_H__ #endif //__SSE3__ - diff --git a/include/binarypacking.h b/include/binarypacking.h index 3c59b85..43955fb 100644 --- a/include/binarypacking.h +++ b/include/binarypacking.h @@ -14,179 +14,174 @@ namespace SIMDCompressionLib { - struct BasicBlockPacker { - static void inline unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset) { - BitPackingHelpers::fastunpack(in, out, bit); - if (bit < 32) inverseDelta(initoffset, out); - initoffset = *(out + 
BitPackingHelpers::BlockSize - 1); - } - - static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset) { - uint32_t accumulator = in[0] - initoffset; - for (uint32_t k = 1; k < BitPackingHelpers::BlockSize; ++k) { - accumulator |= in[k] - in[k - 1]; - } - initoffset = in [BitPackingHelpers::BlockSize - 1]; - return gccbits(accumulator); + static void inline unpackblock(const uint32_t *in, uint32_t *out, + const uint32_t bit, uint32_t &initoffset) { + BitPackingHelpers::fastunpack(in, out, bit); + if (bit < 32) + inverseDelta(initoffset, out); + initoffset = *(out + BitPackingHelpers::BlockSize - 1); + } + + static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset) { + uint32_t accumulator = in[0] - initoffset; + for (uint32_t k = 1; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k] - in[k - 1]; } - - static void inline packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset) { - const uint32_t nextoffset = *(in + BitPackingHelpers::BlockSize - 1); - if (bit < 32) delta(initoffset, in); - BitPackingHelpers::fastpackwithoutmask(in, out, bit); - initoffset = nextoffset; - } - static string name() { - return "BasicBlockPacker"; - } - + initoffset = in[BitPackingHelpers::BlockSize - 1]; + return gccbits(accumulator); + } + + static void inline packblockwithoutmask(uint32_t *in, uint32_t *out, + const uint32_t bit, + uint32_t &initoffset) { + const uint32_t nextoffset = *(in + BitPackingHelpers::BlockSize - 1); + if (bit < 32) + delta(initoffset, in); + BitPackingHelpers::fastpackwithoutmask(in, out, bit); + initoffset = nextoffset; + } + static string name() { return "BasicBlockPacker"; } }; struct NoDeltaBlockPacker { - static void inline unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) { - BitPackingHelpers::fastunpack(in, out, bit); - } - static void inline packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) { - 
BitPackingHelpers::fastpackwithoutmask(in, out, bit); - } - - static uint32_t maxbits(const uint32_t *in, uint32_t &) { - uint32_t accumulator = 0; - for (uint32_t k = 0; k < BitPackingHelpers::BlockSize; ++k) { - accumulator |= in[k]; - } - return gccbits(accumulator); + static void inline unpackblock(const uint32_t *in, uint32_t *out, + const uint32_t bit, uint32_t &) { + BitPackingHelpers::fastunpack(in, out, bit); + } + static void inline packblockwithoutmask(uint32_t *in, uint32_t *out, + const uint32_t bit, uint32_t &) { + BitPackingHelpers::fastpackwithoutmask(in, out, bit); + } + + static uint32_t maxbits(const uint32_t *in, uint32_t &) { + uint32_t accumulator = 0; + for (uint32_t k = 0; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k]; } + return gccbits(accumulator); + } - static string name() { - return "NoDeltaBlockPacker"; - } + static string name() { return "NoDeltaBlockPacker"; } }; - - struct IntegratedBlockPacker { - PURE_FUNCTION - static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset) { - uint32_t accumulator = in[0] - initoffset; - for (uint32_t k = 1; k < BitPackingHelpers::BlockSize; ++k) { - accumulator |= in[k] - in[k - 1]; - } - initoffset = in [BitPackingHelpers::BlockSize - 1]; - return gccbits(accumulator); - } - - static void inline packblockwithoutmask(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset) { - BitPackingHelpers::integratedfastpackwithoutmask(initoffset, in, out, bit); - initoffset = *(in + BitPackingHelpers::BlockSize - 1); - } - static void inline unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset) { - BitPackingHelpers::integratedfastunpack(initoffset, in, out, bit); - initoffset = *(out + BitPackingHelpers::BlockSize - 1); - } - static string name() { - return "IntegratedBlockPacker"; + PURE_FUNCTION + static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset) { + uint32_t accumulator = in[0] - initoffset; + for (uint32_t k 
= 1; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k] - in[k - 1]; } + initoffset = in[BitPackingHelpers::BlockSize - 1]; + return gccbits(accumulator); + } + + static void inline packblockwithoutmask(const uint32_t *in, uint32_t *out, + const uint32_t bit, + uint32_t &initoffset) { + BitPackingHelpers::integratedfastpackwithoutmask(initoffset, in, out, bit); + initoffset = *(in + BitPackingHelpers::BlockSize - 1); + } + static void inline unpackblock(const uint32_t *in, uint32_t *out, + const uint32_t bit, uint32_t &initoffset) { + BitPackingHelpers::integratedfastunpack(initoffset, in, out, bit); + initoffset = *(out + BitPackingHelpers::BlockSize - 1); + } + static string name() { return "IntegratedBlockPacker"; } }; - - -template -class BinaryPacking: public IntegerCODEC { +template class BinaryPacking : public IntegerCODEC { public: - - - static const uint32_t MiniBlockSize = 32; - static const uint32_t HowManyMiniBlocks = 4; - static const uint32_t BlockSize = MiniBlockSize;//HowManyMiniBlocks * MiniBlockSize; - static const uint32_t bits32 = 8 ; - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - checkifdivisibleby(length, BlockSize); - const uint32_t *const initout(out); - *out++ = static_cast(length); - uint32_t Bs[HowManyMiniBlocks]; - uint32_t init = 0; - const uint32_t *const final = in + length; - for (; in + HowManyMiniBlocks * MiniBlockSize - <= final; in += HowManyMiniBlocks * MiniBlockSize) { - uint32_t tmpinit = init; - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); - } - *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) - | Bs[3]; - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init); - out += Bs[i]; - } - } - if (in < final) { - size_t howmany = (final - in) / MiniBlockSize; - uint32_t tmpinit = init; - memset(&Bs[0], 0, HowManyMiniBlocks * 
sizeof(uint32_t)); - for (uint32_t i = 0; i < howmany; ++i) { - Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); - } - *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) - | Bs[3]; - for (uint32_t i = 0; i < howmany; ++i) { - BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init); - out += Bs[i]; - } - } - nvalue = out - initout; + static const uint32_t MiniBlockSize = 32; + static const uint32_t HowManyMiniBlocks = 4; + static const uint32_t BlockSize = + MiniBlockSize; // HowManyMiniBlocks * MiniBlockSize; + static const uint32_t bits32 = 8; + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t *const initout(out); + *out++ = static_cast(length); + uint32_t Bs[HowManyMiniBlocks]; + uint32_t init = 0; + const uint32_t *const final = in + length; + for (; in + HowManyMiniBlocks * MiniBlockSize <= final; + in += HowManyMiniBlocks * MiniBlockSize) { + uint32_t tmpinit = init; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3]; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], + init); + out += Bs[i]; + } } - - const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/, - uint32_t *out, size_t &nvalue) { - const uint32_t actuallength = *in++; - checkifdivisibleby(actuallength, BlockSize); - const uint32_t *const initout(out); - uint32_t Bs[HowManyMiniBlocks]; - uint32_t init = 0; - for (; out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize - ; out += HowManyMiniBlocks * MiniBlockSize) { - Bs[0] = static_cast(in[0] >> 24); - Bs[1] = static_cast(in[0] >> 16); - Bs[2] = static_cast(in[0] >> 8); - Bs[3] = static_cast(in[0]); - ++in; - for (uint32_t i = 0; i < 
HowManyMiniBlocks; ++i) { - BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); - in += Bs[i]; - } - } - if (out < initout + actuallength) { - size_t howmany = (initout + actuallength - out) / MiniBlockSize; - Bs[0] = static_cast(in[0] >> 24); - Bs[1] = static_cast(in[0] >> 16); - Bs[2] = static_cast(in[0] >> 8); - Bs[3] = static_cast(in[0]); - ++in; - for (uint32_t i = 0; i < howmany; ++i) { - BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); - in += Bs[i]; - } - out += howmany * MiniBlockSize; - - } - nvalue = out - initout; - return in; + if (in < final) { + size_t howmany = (final - in) / MiniBlockSize; + uint32_t tmpinit = init; + memset(&Bs[0], 0, HowManyMiniBlocks * sizeof(uint32_t)); + for (uint32_t i = 0; i < howmany; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3]; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], + init); + out += Bs[i]; + } } - - string name() const { - ostringstream convert; - convert << "BinaryPacking" << "With" << BlockPacker::name() << MiniBlockSize; - return convert.str(); + nvalue = out - initout; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/, + uint32_t *out, size_t &nvalue) { + const uint32_t actuallength = *in++; + checkifdivisibleby(actuallength, BlockSize); + const uint32_t *const initout(out); + uint32_t Bs[HowManyMiniBlocks]; + uint32_t init = 0; + for (; out < initout + + actuallength / (HowManyMiniBlocks * MiniBlockSize) * + HowManyMiniBlocks * MiniBlockSize; + out += HowManyMiniBlocks * MiniBlockSize) { + Bs[0] = static_cast(in[0] >> 24); + Bs[1] = static_cast(in[0] >> 16); + Bs[2] = static_cast(in[0] >> 8); + Bs[3] = static_cast(in[0]); + ++in; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); + in += Bs[i]; + } } - + if (out < 
initout + actuallength) { + size_t howmany = (initout + actuallength - out) / MiniBlockSize; + Bs[0] = static_cast(in[0] >> 24); + Bs[1] = static_cast(in[0] >> 16); + Bs[2] = static_cast(in[0] >> 8); + Bs[3] = static_cast(in[0]); + ++in; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); + in += Bs[i]; + } + out += howmany * MiniBlockSize; + } + nvalue = out - initout; + return in; + } + + string name() const { + ostringstream convert; + convert << "BinaryPacking" + << "With" << BlockPacker::name() << MiniBlockSize; + return convert.str(); + } }; - - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_BINARYPACKING_H_ */ diff --git a/include/bitpacking.h b/include/bitpacking.h index 207cd1e..6f67e96 100644 --- a/include/bitpacking.h +++ b/include/bitpacking.h @@ -11,108 +11,163 @@ namespace SIMDCompressionLib { -void __fastunpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void 
__fastunpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastunpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void 
__fastunpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack27(const uint32_t *__restrict__ in, + uint32_t 
*__restrict__ out); +void __fastunpack28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack19(const 
uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void 
__fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); - -void __fastpackwithoutmask0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask2(const 
uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void 
__fastpackwithoutmask24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __fastpackwithoutmask32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpackwithoutmask0(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask12(const uint32_t *__restrict__ 
in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); } // namespace SIMDCompressionLib diff --git a/include/bitpackinghelpers.h 
b/include/bitpackinghelpers.h index 341e1f2..a8ef3e8 100644 --- a/include/bitpackinghelpers.h +++ b/include/bitpackinghelpers.h @@ -16,671 +16,676 @@ namespace SIMDCompressionLib { struct BitPackingHelpers { - const static unsigned BlockSize = 32; - - static void inline fastunpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __fastunpack0(in, out); - break; - case 1: - __fastunpack1(in, out); - break; - case 2: - __fastunpack2(in, out); - break; - case 3: - __fastunpack3(in, out); - break; - case 4: - __fastunpack4(in, out); - break; - case 5: - __fastunpack5(in, out); - break; - case 6: - __fastunpack6(in, out); - break; - case 7: - __fastunpack7(in, out); - break; - case 8: - __fastunpack8(in, out); - break; - case 9: - __fastunpack9(in, out); - break; - case 10: - __fastunpack10(in, out); - break; - case 11: - __fastunpack11(in, out); - break; - case 12: - __fastunpack12(in, out); - break; - case 13: - __fastunpack13(in, out); - break; - case 14: - __fastunpack14(in, out); - break; - case 15: - __fastunpack15(in, out); - break; - case 16: - __fastunpack16(in, out); - break; - case 17: - __fastunpack17(in, out); - break; - case 18: - __fastunpack18(in, out); - break; - case 19: - __fastunpack19(in, out); - break; - case 20: - __fastunpack20(in, out); - break; - case 21: - __fastunpack21(in, out); - break; - case 22: - __fastunpack22(in, out); - break; - case 23: - __fastunpack23(in, out); - break; - case 24: - __fastunpack24(in, out); - break; - case 25: - __fastunpack25(in, out); - break; - case 26: - __fastunpack26(in, out); - break; - case 27: - __fastunpack27(in, out); - break; - case 28: - __fastunpack28(in, out); - break; - case 29: - __fastunpack29(in, out); - break; - case 30: - 
__fastunpack30(in, out); - break; - case 31: - __fastunpack31(in, out); - break; - case 32: - __fastunpack32(in, out); - break; - default: - break; - } + const static unsigned BlockSize = 32; + + static void inline fastunpack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastunpack0(in, out); + break; + case 1: + __fastunpack1(in, out); + break; + case 2: + __fastunpack2(in, out); + break; + case 3: + __fastunpack3(in, out); + break; + case 4: + __fastunpack4(in, out); + break; + case 5: + __fastunpack5(in, out); + break; + case 6: + __fastunpack6(in, out); + break; + case 7: + __fastunpack7(in, out); + break; + case 8: + __fastunpack8(in, out); + break; + case 9: + __fastunpack9(in, out); + break; + case 10: + __fastunpack10(in, out); + break; + case 11: + __fastunpack11(in, out); + break; + case 12: + __fastunpack12(in, out); + break; + case 13: + __fastunpack13(in, out); + break; + case 14: + __fastunpack14(in, out); + break; + case 15: + __fastunpack15(in, out); + break; + case 16: + __fastunpack16(in, out); + break; + case 17: + __fastunpack17(in, out); + break; + case 18: + __fastunpack18(in, out); + break; + case 19: + __fastunpack19(in, out); + break; + case 20: + __fastunpack20(in, out); + break; + case 21: + __fastunpack21(in, out); + break; + case 22: + __fastunpack22(in, out); + break; + case 23: + __fastunpack23(in, out); + break; + case 24: + __fastunpack24(in, out); + break; + case 25: + __fastunpack25(in, out); + break; + case 26: + __fastunpack26(in, out); + break; + case 27: + __fastunpack27(in, out); + break; + case 28: + __fastunpack28(in, out); + break; + case 29: + __fastunpack29(in, out); + break; + case 30: + __fastunpack30(in, out); + break; + case 31: + 
__fastunpack31(in, out); + break; + case 32: + __fastunpack32(in, out); + break; + default: + break; } - - - - static void inline fastpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __fastpack0(in, out); - break; - case 1: - __fastpack1(in, out); - break; - case 2: - __fastpack2(in, out); - break; - case 3: - __fastpack3(in, out); - break; - case 4: - __fastpack4(in, out); - break; - case 5: - __fastpack5(in, out); - break; - case 6: - __fastpack6(in, out); - break; - case 7: - __fastpack7(in, out); - break; - case 8: - __fastpack8(in, out); - break; - case 9: - __fastpack9(in, out); - break; - case 10: - __fastpack10(in, out); - break; - case 11: - __fastpack11(in, out); - break; - case 12: - __fastpack12(in, out); - break; - case 13: - __fastpack13(in, out); - break; - case 14: - __fastpack14(in, out); - break; - case 15: - __fastpack15(in, out); - break; - case 16: - __fastpack16(in, out); - break; - case 17: - __fastpack17(in, out); - break; - case 18: - __fastpack18(in, out); - break; - case 19: - __fastpack19(in, out); - break; - case 20: - __fastpack20(in, out); - break; - case 21: - __fastpack21(in, out); - break; - case 22: - __fastpack22(in, out); - break; - case 23: - __fastpack23(in, out); - break; - case 24: - __fastpack24(in, out); - break; - case 25: - __fastpack25(in, out); - break; - case 26: - __fastpack26(in, out); - break; - case 27: - __fastpack27(in, out); - break; - case 28: - __fastpack28(in, out); - break; - case 29: - __fastpack29(in, out); - break; - case 30: - __fastpack30(in, out); - break; - case 31: - __fastpack31(in, out); - break; - case 32: - __fastpack32(in, out); - break; - default: - break; - } + } + + static void inline fastpack(const uint32_t 
*__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastpack0(in, out); + break; + case 1: + __fastpack1(in, out); + break; + case 2: + __fastpack2(in, out); + break; + case 3: + __fastpack3(in, out); + break; + case 4: + __fastpack4(in, out); + break; + case 5: + __fastpack5(in, out); + break; + case 6: + __fastpack6(in, out); + break; + case 7: + __fastpack7(in, out); + break; + case 8: + __fastpack8(in, out); + break; + case 9: + __fastpack9(in, out); + break; + case 10: + __fastpack10(in, out); + break; + case 11: + __fastpack11(in, out); + break; + case 12: + __fastpack12(in, out); + break; + case 13: + __fastpack13(in, out); + break; + case 14: + __fastpack14(in, out); + break; + case 15: + __fastpack15(in, out); + break; + case 16: + __fastpack16(in, out); + break; + case 17: + __fastpack17(in, out); + break; + case 18: + __fastpack18(in, out); + break; + case 19: + __fastpack19(in, out); + break; + case 20: + __fastpack20(in, out); + break; + case 21: + __fastpack21(in, out); + break; + case 22: + __fastpack22(in, out); + break; + case 23: + __fastpack23(in, out); + break; + case 24: + __fastpack24(in, out); + break; + case 25: + __fastpack25(in, out); + break; + case 26: + __fastpack26(in, out); + break; + case 27: + __fastpack27(in, out); + break; + case 28: + __fastpack28(in, out); + break; + case 29: + __fastpack29(in, out); + break; + case 30: + __fastpack30(in, out); + break; + case 31: + __fastpack31(in, out); + break; + case 32: + __fastpack32(in, out); + break; + default: + break; } - - - - /*assumes that integers fit in the prescribed number of bits*/ - static void inline fastpackwithoutmask(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, - const uint32_t bit) { - // Could have 
used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __fastpackwithoutmask0(in, out); - break; - case 1: - __fastpackwithoutmask1(in, out); - break; - case 2: - __fastpackwithoutmask2(in, out); - break; - case 3: - __fastpackwithoutmask3(in, out); - break; - case 4: - __fastpackwithoutmask4(in, out); - break; - case 5: - __fastpackwithoutmask5(in, out); - break; - case 6: - __fastpackwithoutmask6(in, out); - break; - case 7: - __fastpackwithoutmask7(in, out); - break; - case 8: - __fastpackwithoutmask8(in, out); - break; - case 9: - __fastpackwithoutmask9(in, out); - break; - case 10: - __fastpackwithoutmask10(in, out); - break; - case 11: - __fastpackwithoutmask11(in, out); - break; - case 12: - __fastpackwithoutmask12(in, out); - break; - case 13: - __fastpackwithoutmask13(in, out); - break; - case 14: - __fastpackwithoutmask14(in, out); - break; - case 15: - __fastpackwithoutmask15(in, out); - break; - case 16: - __fastpackwithoutmask16(in, out); - break; - case 17: - __fastpackwithoutmask17(in, out); - break; - case 18: - __fastpackwithoutmask18(in, out); - break; - case 19: - __fastpackwithoutmask19(in, out); - break; - case 20: - __fastpackwithoutmask20(in, out); - break; - case 21: - __fastpackwithoutmask21(in, out); - break; - case 22: - __fastpackwithoutmask22(in, out); - break; - case 23: - __fastpackwithoutmask23(in, out); - break; - case 24: - __fastpackwithoutmask24(in, out); - break; - case 25: - __fastpackwithoutmask25(in, out); - break; - case 26: - __fastpackwithoutmask26(in, out); - break; - case 27: - __fastpackwithoutmask27(in, out); - break; - case 28: - __fastpackwithoutmask28(in, out); - break; - case 29: - __fastpackwithoutmask29(in, out); - break; - case 30: - __fastpackwithoutmask30(in, out); - break; - case 31: - __fastpackwithoutmask31(in, out); - break; - case 32: - 
__fastpackwithoutmask32(in, out); - break; - default: - break; - } + } + + /*assumes that integers fit in the prescribed number of bits*/ + static void inline fastpackwithoutmask(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastpackwithoutmask0(in, out); + break; + case 1: + __fastpackwithoutmask1(in, out); + break; + case 2: + __fastpackwithoutmask2(in, out); + break; + case 3: + __fastpackwithoutmask3(in, out); + break; + case 4: + __fastpackwithoutmask4(in, out); + break; + case 5: + __fastpackwithoutmask5(in, out); + break; + case 6: + __fastpackwithoutmask6(in, out); + break; + case 7: + __fastpackwithoutmask7(in, out); + break; + case 8: + __fastpackwithoutmask8(in, out); + break; + case 9: + __fastpackwithoutmask9(in, out); + break; + case 10: + __fastpackwithoutmask10(in, out); + break; + case 11: + __fastpackwithoutmask11(in, out); + break; + case 12: + __fastpackwithoutmask12(in, out); + break; + case 13: + __fastpackwithoutmask13(in, out); + break; + case 14: + __fastpackwithoutmask14(in, out); + break; + case 15: + __fastpackwithoutmask15(in, out); + break; + case 16: + __fastpackwithoutmask16(in, out); + break; + case 17: + __fastpackwithoutmask17(in, out); + break; + case 18: + __fastpackwithoutmask18(in, out); + break; + case 19: + __fastpackwithoutmask19(in, out); + break; + case 20: + __fastpackwithoutmask20(in, out); + break; + case 21: + __fastpackwithoutmask21(in, out); + break; + case 22: + __fastpackwithoutmask22(in, out); + break; + case 23: + __fastpackwithoutmask23(in, out); + break; + case 24: + __fastpackwithoutmask24(in, out); + break; + case 25: + __fastpackwithoutmask25(in, out); + break; + case 26: + __fastpackwithoutmask26(in, out); + break; + case 27: + 
__fastpackwithoutmask27(in, out); + break; + case 28: + __fastpackwithoutmask28(in, out); + break; + case 29: + __fastpackwithoutmask29(in, out); + break; + case 30: + __fastpackwithoutmask30(in, out); + break; + case 31: + __fastpackwithoutmask31(in, out); + break; + case 32: + __fastpackwithoutmask32(in, out); + break; + default: + break; } - - - static void inline integratedfastunpack(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __integratedfastunpack0(initoffset, in, out); - break; - case 1: - __integratedfastunpack1(initoffset, in, out); - break; - case 2: - __integratedfastunpack2(initoffset, in, out); - break; - case 3: - __integratedfastunpack3(initoffset, in, out); - break; - case 4: - __integratedfastunpack4(initoffset, in, out); - break; - case 5: - __integratedfastunpack5(initoffset, in, out); - break; - case 6: - __integratedfastunpack6(initoffset, in, out); - break; - case 7: - __integratedfastunpack7(initoffset, in, out); - break; - case 8: - __integratedfastunpack8(initoffset, in, out); - break; - case 9: - __integratedfastunpack9(initoffset, in, out); - break; - case 10: - __integratedfastunpack10(initoffset, in, out); - break; - case 11: - __integratedfastunpack11(initoffset, in, out); - break; - case 12: - __integratedfastunpack12(initoffset, in, out); - break; - case 13: - __integratedfastunpack13(initoffset, in, out); - break; - case 14: - __integratedfastunpack14(initoffset, in, out); - break; - case 15: - __integratedfastunpack15(initoffset, in, out); - break; - case 16: - __integratedfastunpack16(initoffset, in, out); - break; - case 17: - __integratedfastunpack17(initoffset, in, out); - break; - case 18: - 
__integratedfastunpack18(initoffset, in, out); - break; - case 19: - __integratedfastunpack19(initoffset, in, out); - break; - case 20: - __integratedfastunpack20(initoffset, in, out); - break; - case 21: - __integratedfastunpack21(initoffset, in, out); - break; - case 22: - __integratedfastunpack22(initoffset, in, out); - break; - case 23: - __integratedfastunpack23(initoffset, in, out); - break; - case 24: - __integratedfastunpack24(initoffset, in, out); - break; - case 25: - __integratedfastunpack25(initoffset, in, out); - break; - case 26: - __integratedfastunpack26(initoffset, in, out); - break; - case 27: - __integratedfastunpack27(initoffset, in, out); - break; - case 28: - __integratedfastunpack28(initoffset, in, out); - break; - case 29: - __integratedfastunpack29(initoffset, in, out); - break; - case 30: - __integratedfastunpack30(initoffset, in, out); - break; - case 31: - __integratedfastunpack31(initoffset, in, out); - break; - case 32: - __integratedfastunpack32(initoffset, in, out); - break; - default: - break; - } + } + + static void inline integratedfastunpack(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __integratedfastunpack0(initoffset, in, out); + break; + case 1: + __integratedfastunpack1(initoffset, in, out); + break; + case 2: + __integratedfastunpack2(initoffset, in, out); + break; + case 3: + __integratedfastunpack3(initoffset, in, out); + break; + case 4: + __integratedfastunpack4(initoffset, in, out); + break; + case 5: + __integratedfastunpack5(initoffset, in, out); + break; + case 6: + __integratedfastunpack6(initoffset, in, out); + break; + case 7: + __integratedfastunpack7(initoffset, in, out); + break; + case 8: + __integratedfastunpack8(initoffset, in, out); + break; + case 9: + __integratedfastunpack9(initoffset, in, out); + break; + case 10: + __integratedfastunpack10(initoffset, in, out); + break; + case 11: + __integratedfastunpack11(initoffset, in, out); + break; + case 12: + __integratedfastunpack12(initoffset, in, out); + break; + case 13: + __integratedfastunpack13(initoffset, in, out); + break; + case 14: + __integratedfastunpack14(initoffset, in, out); + break; + case 15: + __integratedfastunpack15(initoffset, in, out); + break; + case 16: + __integratedfastunpack16(initoffset, in, out); + break; + case 17: + __integratedfastunpack17(initoffset, in, out); + break; + case 18: + __integratedfastunpack18(initoffset, in, out); + break; + case 19: + __integratedfastunpack19(initoffset, in, out); + break; + case 20: + __integratedfastunpack20(initoffset, in, out); + break; + case 21: + __integratedfastunpack21(initoffset, in, out); + break; + case 22: + __integratedfastunpack22(initoffset, in, out); + break; + case 23: + __integratedfastunpack23(initoffset, in, out); + break; + case 24: + __integratedfastunpack24(initoffset, in, out); + break; + case 25: + __integratedfastunpack25(initoffset, in, out); + break; + case 26: + __integratedfastunpack26(initoffset, in, out); + break; + case 27: + __integratedfastunpack27(initoffset, in, out); + break; + case 28: + __integratedfastunpack28(initoffset, in, out); + break; + 
case 29: + __integratedfastunpack29(initoffset, in, out); + break; + case 30: + __integratedfastunpack30(initoffset, in, out); + break; + case 31: + __integratedfastunpack31(initoffset, in, out); + break; + case 32: + __integratedfastunpack32(initoffset, in, out); + break; + default: + break; } - - - - /*assumes that integers fit in the prescribed number of bits*/ - static void inline integratedfastpackwithoutmask(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __integratedfastpack0(initoffset, in, out); - break; - case 1: - __integratedfastpack1(initoffset, in, out); - break; - case 2: - __integratedfastpack2(initoffset, in, out); - break; - case 3: - __integratedfastpack3(initoffset, in, out); - break; - case 4: - __integratedfastpack4(initoffset, in, out); - break; - case 5: - __integratedfastpack5(initoffset, in, out); - break; - case 6: - __integratedfastpack6(initoffset, in, out); - break; - case 7: - __integratedfastpack7(initoffset, in, out); - break; - case 8: - __integratedfastpack8(initoffset, in, out); - break; - case 9: - __integratedfastpack9(initoffset, in, out); - break; - case 10: - __integratedfastpack10(initoffset, in, out); - break; - case 11: - __integratedfastpack11(initoffset, in, out); - break; - case 12: - __integratedfastpack12(initoffset, in, out); - break; - case 13: - __integratedfastpack13(initoffset, in, out); - break; - case 14: - __integratedfastpack14(initoffset, in, out); - break; - case 15: - __integratedfastpack15(initoffset, in, out); - break; - case 16: - __integratedfastpack16(initoffset, in, out); - break; - case 17: - __integratedfastpack17(initoffset, in, out); - break; - case 18: - __integratedfastpack18(initoffset, in, out); 
- break; - case 19: - __integratedfastpack19(initoffset, in, out); - break; - case 20: - __integratedfastpack20(initoffset, in, out); - break; - case 21: - __integratedfastpack21(initoffset, in, out); - break; - case 22: - __integratedfastpack22(initoffset, in, out); - break; - case 23: - __integratedfastpack23(initoffset, in, out); - break; - case 24: - __integratedfastpack24(initoffset, in, out); - break; - case 25: - __integratedfastpack25(initoffset, in, out); - break; - case 26: - __integratedfastpack26(initoffset, in, out); - break; - case 27: - __integratedfastpack27(initoffset, in, out); - break; - case 28: - __integratedfastpack28(initoffset, in, out); - break; - case 29: - __integratedfastpack29(initoffset, in, out); - break; - case 30: - __integratedfastpack30(initoffset, in, out); - break; - case 31: - __integratedfastpack31(initoffset, in, out); - break; - case 32: - __integratedfastpack32(initoffset, in, out); - break; - default: - break; - } + } + + /*assumes that integers fit in the prescribed number of bits*/ + static void inline integratedfastpackwithoutmask( + const uint32_t initoffset, const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __integratedfastpack0(initoffset, in, out); + break; + case 1: + __integratedfastpack1(initoffset, in, out); + break; + case 2: + __integratedfastpack2(initoffset, in, out); + break; + case 3: + __integratedfastpack3(initoffset, in, out); + break; + case 4: + __integratedfastpack4(initoffset, in, out); + break; + case 5: + __integratedfastpack5(initoffset, in, out); + break; + case 6: + __integratedfastpack6(initoffset, in, out); + break; + case 7: + __integratedfastpack7(initoffset, in, out); + break; + case 8: + __integratedfastpack8(initoffset, in, out); + break; + case 9: + __integratedfastpack9(initoffset, in, out); + break; + case 10: + __integratedfastpack10(initoffset, in, out); + break; + case 11: + __integratedfastpack11(initoffset, in, out); + break; + case 12: + __integratedfastpack12(initoffset, in, out); + break; + case 13: + __integratedfastpack13(initoffset, in, out); + break; + case 14: + __integratedfastpack14(initoffset, in, out); + break; + case 15: + __integratedfastpack15(initoffset, in, out); + break; + case 16: + __integratedfastpack16(initoffset, in, out); + break; + case 17: + __integratedfastpack17(initoffset, in, out); + break; + case 18: + __integratedfastpack18(initoffset, in, out); + break; + case 19: + __integratedfastpack19(initoffset, in, out); + break; + case 20: + __integratedfastpack20(initoffset, in, out); + break; + case 21: + __integratedfastpack21(initoffset, in, out); + break; + case 22: + __integratedfastpack22(initoffset, in, out); + break; + case 23: + __integratedfastpack23(initoffset, in, out); + break; + case 24: + __integratedfastpack24(initoffset, in, out); + break; + case 25: + __integratedfastpack25(initoffset, in, out); + break; + case 26: + __integratedfastpack26(initoffset, in, out); + break; + case 27: + __integratedfastpack27(initoffset, in, out); + break; + case 28: + __integratedfastpack28(initoffset, in, out); + break; + case 29: + __integratedfastpack29(initoffset, in, out); + 
break; + case 30: + __integratedfastpack30(initoffset, in, out); + break; + case 31: + __integratedfastpack31(initoffset, in, out); + break; + case 32: + __integratedfastpack32(initoffset, in, out); + break; + default: + break; } + } - - - static void inline ipackwithoutmask(const uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % BlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - uint32_t initoffset = 0; - - for (size_t k = 0; k < Qty / BlockSize; ++k) { - integratedfastpackwithoutmask(initoffset, in + k * BlockSize, out + k * bit, bit); - initoffset = *(in + k * BlockSize + BlockSize - 1); - } + static void inline ipackwithoutmask(const uint32_t *in, const size_t Qty, + uint32_t *out, const uint32_t bit) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); } + uint32_t initoffset = 0; - - static void inline pack(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % BlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - uint32_t initoffset = 0; - - for (size_t k = 0; k < Qty / BlockSize; ++k) { - const uint32_t nextoffset = *(in + k * BlockSize + BlockSize - 1); - if (bit < 32) delta(initoffset, in + k * BlockSize); - fastpack(in + k * BlockSize, out + k * bit, bit); - initoffset = nextoffset; - } + for (size_t k = 0; k < Qty / BlockSize; ++k) { + integratedfastpackwithoutmask(initoffset, in + k * BlockSize, + out + k * bit, bit); + initoffset = *(in + k * BlockSize + BlockSize - 1); } + } - static void inline packWithoutDelta(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - for (size_t k = 0; k < Qty / BlockSize; ++k) { - fastpack(in + k * BlockSize, out + k * bit, bit); - } + static void inline pack(uint32_t *in, const size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); } - - static void inline unpack(const uint32_t *in, const size_t Qty, 
uint32_t *out, const uint32_t bit) { - if (Qty % BlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - uint32_t initoffset = 0; - - for (size_t k = 0; k < Qty / BlockSize; ++k) { - fastunpack(in + k * bit, out + k * BlockSize, bit); - if (bit < 32) inverseDelta(initoffset, out + k * BlockSize); - initoffset = *(out + k * BlockSize + BlockSize - 1); - } + uint32_t initoffset = 0; + + for (size_t k = 0; k < Qty / BlockSize; ++k) { + const uint32_t nextoffset = *(in + k * BlockSize + BlockSize - 1); + if (bit < 32) + delta(initoffset, in + k * BlockSize); + fastpack(in + k * BlockSize, out + k * bit, bit); + initoffset = nextoffset; } + } - static void inline unpackWithoutDelta(const uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - for (size_t k = 0; k < Qty / BlockSize; ++k) { - fastunpack(in + k * bit, out + k * BlockSize, bit); - } + static void inline packWithoutDelta(uint32_t *in, const size_t Qty, + uint32_t *out, const uint32_t bit) { + for (size_t k = 0; k < Qty / BlockSize; ++k) { + fastpack(in + k * BlockSize, out + k * bit, bit); } + } - - static void inline packwithoutmask(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % BlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - uint32_t initoffset = 0; - - for (size_t k = 0; k < Qty / BlockSize; ++k) { - const uint32_t nextoffset = *(in + k * BlockSize + BlockSize - 1); - if (bit < 32) delta(initoffset, in + k * BlockSize); - fastpackwithoutmask(in + k * BlockSize, out + k * bit, bit); - initoffset = nextoffset; - } + static void inline unpack(const uint32_t *in, const size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); } + uint32_t initoffset = 0; - - static void inline packwithoutmaskWithoutDelta(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - for (size_t k = 0; k < Qty / BlockSize; ++k) { - fastpackwithoutmask(in + k * 
BlockSize, out + k * bit, bit); - } + for (size_t k = 0; k < Qty / BlockSize; ++k) { + fastunpack(in + k * bit, out + k * BlockSize, bit); + if (bit < 32) + inverseDelta(initoffset, out + k * BlockSize); + initoffset = *(out + k * BlockSize + BlockSize - 1); } + } + static void inline unpackWithoutDelta(const uint32_t *in, const size_t Qty, + uint32_t *out, const uint32_t bit) { + for (size_t k = 0; k < Qty / BlockSize; ++k) { + fastunpack(in + k * bit, out + k * BlockSize, bit); + } + } - static void inline iunpack(const uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % BlockSize) { - throw std::logic_error("Incorrect # of entries."); - } + static void inline packwithoutmask(uint32_t *in, const size_t Qty, + uint32_t *out, const uint32_t bit) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + uint32_t initoffset = 0; + + for (size_t k = 0; k < Qty / BlockSize; ++k) { + const uint32_t nextoffset = *(in + k * BlockSize + BlockSize - 1); + if (bit < 32) + delta(initoffset, in + k * BlockSize); + fastpackwithoutmask(in + k * BlockSize, out + k * bit, bit); + initoffset = nextoffset; + } + } - uint32_t initoffset = 0; - for (size_t k = 0; k < Qty / BlockSize; ++k) { - integratedfastunpack(initoffset, in + k * bit, out + k * BlockSize, bit); - initoffset = *(out + k * BlockSize + BlockSize - 1); - } + static void inline packwithoutmaskWithoutDelta(uint32_t *in, const size_t Qty, + uint32_t *out, + const uint32_t bit) { + for (size_t k = 0; k < Qty / BlockSize; ++k) { + fastpackwithoutmask(in + k * BlockSize, out + k * bit, bit); } + } + static void inline iunpack(const uint32_t *in, const size_t Qty, + uint32_t *out, const uint32_t bit) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } - /*static void GenRandom(std::vector& data, int b) { - data[0] = random(b); + uint32_t initoffset = 0; + for (size_t k = 0; k < Qty / BlockSize; ++k) { + integratedfastunpack(initoffset, 
in + k * bit, out + k * BlockSize, bit); + initoffset = *(out + k * BlockSize + BlockSize - 1); + } + } - for(size_t i = 1 ; i < data.size() ; ++i ) - data[i] = random(b) + data[i-1]; - }*/ + /*static void GenRandom(std::vector& data, int b) { + data[0] = random(b); - static void CheckMaxDiff(const std::vector &refdata, unsigned bit) { - for (size_t i = 1; i < refdata.size(); ++i) { - if (gccbits(refdata[i] - refdata[i - 1]) > bit) throw std::runtime_error("bug"); + for(size_t i = 1 ; i < data.size() ; ++i ) + data[i] = random(b) + data[i-1]; + }*/ - } + static void CheckMaxDiff(const std::vector &refdata, unsigned bit) { + for (size_t i = 1; i < refdata.size(); ++i) { + if (gccbits(refdata[i] - refdata[i - 1]) > bit) + throw std::runtime_error("bug"); } + } }; - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_BITPACKINGHELPERS_H_ */ diff --git a/include/boolarray.h b/include/boolarray.h index 4e58098..6b0f9d0 100644 --- a/include/boolarray.h +++ b/include/boolarray.h @@ -13,192 +13,169 @@ namespace SIMDCompressionLib { using namespace std; - static inline int numberOfTrailingZeros(uint64_t x) { - if (x == 0) return 64; - return __builtin_ctzl(x); + if (x == 0) + return 64; + return __builtin_ctzl(x); } - - - class BoolArray { public: - - - vector buffer; - size_t sizeinbits; - BoolArray(const size_t n, const uint64_t initval = 0) : - buffer(n / 64 + (n % 64 == 0 ? 0 : 1), initval), - sizeinbits(n) { - } - - BoolArray() : - buffer(), sizeinbits(0) { - } - - BoolArray(const BoolArray &ba) : - buffer(ba.buffer), sizeinbits(ba.sizeinbits) { - } - - void inplaceIntersect(const BoolArray &other) { - assert(other.buffer.size() == buffer.size()); - for (size_t i = 0; i < buffer.size(); ++i) - buffer[i] &= other.buffer[i]; - } - - // this is no faster because the compiler will vectorize - // inplaceIntersect automagically? 
- void SIMDinplaceIntersect(const BoolArray &other) { - assert(other.buffer.size() == buffer.size()); - __m128i *bin = reinterpret_cast<__m128i *>(buffer.data()); - const __m128i *bo = reinterpret_cast(other.buffer.data()); - for (size_t i = 0; i < buffer.size() / 2; ++i) { - __m128i p1 = MM_LOAD_SI_128(bin + i); - __m128i p2 = MM_LOAD_SI_128(bo + i); - __m128i andp1p2 = _mm_and_si128(p1, p2); - _mm_storeu_si128(bin + i, andp1p2); - } - for (size_t i = buffer.size() / 2 * 2; i < buffer.size(); ++i) - buffer[i] &= other.buffer[i]; - } - - - void intersect(const BoolArray &other, BoolArray &output) { - assert(other.buffer.size() == buffer.size()); - output.buffer.resize(buffer.size()); - for (size_t i = 0; i < buffer.size(); ++i) - output.buffer[i] = buffer[i] & other.buffer[i]; - } - - - // this is no faster because the compiler will vectorize - // intersect automagically? - void SIMDintersect(const BoolArray &other, BoolArray &output) { - assert(other.buffer.size() == buffer.size()); - output.buffer.resize(buffer.size()); - const __m128i *bin = reinterpret_cast(buffer.data()); - const __m128i *bo = reinterpret_cast(other.buffer.data()); - __m128i *bout = reinterpret_cast<__m128i *>(output.buffer.data()); - - for (size_t i = 0; i < buffer.size() / 2; ++i) { - __m128i p1 = MM_LOAD_SI_128(bin + i); - __m128i p2 = MM_LOAD_SI_128(bo + i); - __m128i andp1p2 = _mm_and_si128(p1, p2); - _mm_storeu_si128(bout + i, andp1p2); - } - for (size_t i = buffer.size() / 2 * 2; i < buffer.size(); ++i) - output.buffer[i] = buffer[i] & other.buffer[i]; - } - - void setSizeInBits(const size_t sizeib) { - sizeinbits = sizeib; - } - - /** - * Write out this bitmap to a vector as a list of integers corresponding - * to set bits. The caller should have allocated enough memory. 
- */ - void toArray(vector &ans) { - uint32_t pos = 0; - for (uint32_t k = 0; k < buffer.size(); ++k) { - uint64_t myword = buffer[k]; - while (myword != 0) { - int ntz = __builtin_ctzl(myword); - ans[pos++] = k * 64 + ntz; - myword ^= (1ll << ntz); - } - } - ans.resize(pos); - } - - - /** - * This is a version of toArray where we write to a pointer. - * Returns the number of written ints. - */ - size_t toInts(uint32_t *out) { - size_t pos = 0; - for (uint32_t k = 0; k < buffer.size(); ++k) { - const uint64_t myword = buffer[k]; - for (int offset = 0; offset < 64; ++offset) { - if ((myword >> offset) == 0) break; - offset += numberOfTrailingZeros((myword >> offset)); - out[pos++] = 64 * k + offset; - } - } - return pos; - } - BoolArray &operator=(const BoolArray &x) { - this->buffer = x.buffer; - this->sizeinbits = x.sizeinbits; - return *this; - } - - /** - * set to true (whether it was already set to true or not) - * - * This is an expensive (random access) API, you really ought to - * prepare a new word and then append it. - */ - ALWAYS_INLINE - void set(const size_t pos) { - buffer[pos / 64] |= (static_cast(1) << (pos - % 64)); - } - - /** - * set to false (whether it was already set to false or not) - * - * This is an expensive (random access) API, you really ought to - * prepare a new word and then append it. - */ - ALWAYS_INLINE - void unset(const size_t pos) { - buffer[pos / 64] |= ~(static_cast(1) << (pos - % 64)); - } - - /** - * true of false? (set or unset) - */ - ALWAYS_INLINE - bool get(const size_t pos) const { - return (buffer[pos / 64] & (static_cast(1) << (pos - % 64))) != 0; - } - - /** - * set all bits to 0 - */ - void reset() { - memset(buffer.data(), 0, sizeof(uint64_t) * buffer.size());//memset can be slow, does it matter? 
- sizeinbits = 0; - } - - size_t sizeInBits() const { - return sizeinbits; - } - - size_t sizeInBytes() const { - return buffer.size() * sizeof(uint64_t); - } - - /** - * Return memory usage of a bitmap spanning n bits - */ - static size_t sizeInBytes(size_t n) { - size_t buffersize = n / 64 + (n % 64 == 0 ? 0 : 1); - return buffersize * sizeof(uint64_t); - } - - ~BoolArray() { - } - - + vector buffer; + size_t sizeinbits; + BoolArray(const size_t n, const uint64_t initval = 0) + : buffer(n / 64 + (n % 64 == 0 ? 0 : 1), initval), sizeinbits(n) {} + + BoolArray() : buffer(), sizeinbits(0) {} + + BoolArray(const BoolArray &ba) + : buffer(ba.buffer), sizeinbits(ba.sizeinbits) {} + + void inplaceIntersect(const BoolArray &other) { + assert(other.buffer.size() == buffer.size()); + for (size_t i = 0; i < buffer.size(); ++i) + buffer[i] &= other.buffer[i]; + } + + // this is no faster because the compiler will vectorize + // inplaceIntersect automagically? + void SIMDinplaceIntersect(const BoolArray &other) { + assert(other.buffer.size() == buffer.size()); + __m128i *bin = reinterpret_cast<__m128i *>(buffer.data()); + const __m128i *bo = reinterpret_cast(other.buffer.data()); + for (size_t i = 0; i < buffer.size() / 2; ++i) { + __m128i p1 = MM_LOAD_SI_128(bin + i); + __m128i p2 = MM_LOAD_SI_128(bo + i); + __m128i andp1p2 = _mm_and_si128(p1, p2); + _mm_storeu_si128(bin + i, andp1p2); + } + for (size_t i = buffer.size() / 2 * 2; i < buffer.size(); ++i) + buffer[i] &= other.buffer[i]; + } + + void intersect(const BoolArray &other, BoolArray &output) { + assert(other.buffer.size() == buffer.size()); + output.buffer.resize(buffer.size()); + for (size_t i = 0; i < buffer.size(); ++i) + output.buffer[i] = buffer[i] & other.buffer[i]; + } + + // this is no faster because the compiler will vectorize + // intersect automagically? 
+ void SIMDintersect(const BoolArray &other, BoolArray &output) { + assert(other.buffer.size() == buffer.size()); + output.buffer.resize(buffer.size()); + const __m128i *bin = reinterpret_cast(buffer.data()); + const __m128i *bo = reinterpret_cast(other.buffer.data()); + __m128i *bout = reinterpret_cast<__m128i *>(output.buffer.data()); + + for (size_t i = 0; i < buffer.size() / 2; ++i) { + __m128i p1 = MM_LOAD_SI_128(bin + i); + __m128i p2 = MM_LOAD_SI_128(bo + i); + __m128i andp1p2 = _mm_and_si128(p1, p2); + _mm_storeu_si128(bout + i, andp1p2); + } + for (size_t i = buffer.size() / 2 * 2; i < buffer.size(); ++i) + output.buffer[i] = buffer[i] & other.buffer[i]; + } + + void setSizeInBits(const size_t sizeib) { sizeinbits = sizeib; } + + /** + * Write out this bitmap to a vector as a list of integers corresponding + * to set bits. The caller should have allocated enough memory. + */ + void toArray(vector &ans) { + uint32_t pos = 0; + for (uint32_t k = 0; k < buffer.size(); ++k) { + uint64_t myword = buffer[k]; + while (myword != 0) { + int ntz = __builtin_ctzl(myword); + ans[pos++] = k * 64 + ntz; + myword ^= (1ll << ntz); + } + } + ans.resize(pos); + } + + /** + * This is a version of toArray where we write to a pointer. + * Returns the number of written ints. + */ + size_t toInts(uint32_t *out) { + size_t pos = 0; + for (uint32_t k = 0; k < buffer.size(); ++k) { + const uint64_t myword = buffer[k]; + for (int offset = 0; offset < 64; ++offset) { + if ((myword >> offset) == 0) + break; + offset += numberOfTrailingZeros((myword >> offset)); + out[pos++] = 64 * k + offset; + } + } + return pos; + } + BoolArray &operator=(const BoolArray &x) { + this->buffer = x.buffer; + this->sizeinbits = x.sizeinbits; + return *this; + } + + /** + * set to true (whether it was already set to true or not) + * + * This is an expensive (random access) API, you really ought to + * prepare a new word and then append it. 
+ */ + ALWAYS_INLINE + void set(const size_t pos) { + buffer[pos / 64] |= (static_cast(1) << (pos % 64)); + } + + /** + * set to false (whether it was already set to false or not) + * + * This is an expensive (random access) API, you really ought to + * prepare a new word and then append it. + */ + ALWAYS_INLINE + void unset(const size_t pos) { + buffer[pos / 64] |= ~(static_cast(1) << (pos % 64)); + } + + /** + * true of false? (set or unset) + */ + ALWAYS_INLINE + bool get(const size_t pos) const { + return (buffer[pos / 64] & (static_cast(1) << (pos % 64))) != 0; + } + + /** + * set all bits to 0 + */ + void reset() { + memset(buffer.data(), 0, + sizeof(uint64_t) * + buffer.size()); // memset can be slow, does it matter? + sizeinbits = 0; + } + + size_t sizeInBits() const { return sizeinbits; } + + size_t sizeInBytes() const { return buffer.size() * sizeof(uint64_t); } + + /** + * Return memory usage of a bitmap spanning n bits + */ + static size_t sizeInBytes(size_t n) { + size_t buffersize = n / 64 + (n % 64 == 0 ? 
0 : 1); + return buffersize * sizeof(uint64_t); + } + + ~BoolArray() {} }; - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_BOOLARRAY_H_ */ diff --git a/include/codecfactory.h b/include/codecfactory.h index e0449fd..ddaf7d8 100644 --- a/include/codecfactory.h +++ b/include/codecfactory.h @@ -33,153 +33,153 @@ namespace SIMDCompressionLib { using namespace std; -typedef VariableByte leftovercodec; +typedef VariableByte leftovercodec; static std::map> initializefactory() { - std::map > schemes; + std::map> schemes; #ifdef __SSSE3__ - schemes["varintg8iu"] = shared_ptr ( - new VarIntG8IU ()); + schemes["varintg8iu"] = shared_ptr(new VarIntG8IU()); #endif /* __SSSE3__ */ - schemes["fastpfor"] = shared_ptr ( - new CompositeCodec , - leftovercodec> ()); - - - schemes["copy"] = shared_ptr (new JustCopy()); - schemes["varint"] = shared_ptr (new VariableByte ()); - schemes["vbyte"] = shared_ptr (new VByte ()); - schemes["maskedvbyte"] = shared_ptr ( - new MaskedVByte ()); - schemes["streamvbyte"] = shared_ptr ( - new StreamVByteD1 ()); - schemes["frameofreference"] = shared_ptr ( - new FrameOfReference ()); - - schemes["simdframeofreference"] = shared_ptr ( - new SIMDFrameOfReference ()); - - schemes["varintgb"] = std::shared_ptr (new VarIntGB()); - - schemes["s4-fastpfor-d4"] = shared_ptr ( - new CompositeCodec , leftovercodec> ()); - schemes["s4-fastpfor-dm"] - = shared_ptr ( - new CompositeCodec , - VariableByte> ()); - schemes["s4-fastpfor-d1"] = shared_ptr ( - new CompositeCodec , leftovercodec> ()); - schemes["s4-fastpfor-d2"] = shared_ptr ( - new CompositeCodec , leftovercodec> ()); - - schemes["bp32"] = shared_ptr ( - new CompositeCodec , VariableByte< - true>> ()); - schemes["ibp32"] = shared_ptr ( - new CompositeCodec , - leftovercodec> ()); - - schemes["s4-bp128-d1-ni"] = shared_ptr ( - new CompositeCodec > , leftovercodec> ()); - schemes["s4-bp128-d2-ni"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - 
schemes["s4-bp128-d4-ni"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - schemes["s4-bp128-dm-ni"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - - schemes["s4-bp128-d1"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - schemes["s4-bp128-d2"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - schemes["s4-bp128-d4"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - schemes["s4-bp128-dm"] = shared_ptr ( - new CompositeCodec >, leftovercodec> ()); - schemes["for"] = shared_ptr ( - new ForCODEC ()); - return schemes; + schemes["fastpfor"] = shared_ptr( + new CompositeCodec, leftovercodec>()); + + schemes["copy"] = shared_ptr(new JustCopy()); + schemes["varint"] = shared_ptr(new VariableByte()); + schemes["vbyte"] = shared_ptr(new VByte()); + schemes["maskedvbyte"] = shared_ptr(new MaskedVByte()); + schemes["streamvbyte"] = shared_ptr(new StreamVByteD1()); + schemes["frameofreference"] = + shared_ptr(new FrameOfReference()); + + schemes["simdframeofreference"] = + shared_ptr(new SIMDFrameOfReference()); + + schemes["varintgb"] = std::shared_ptr(new VarIntGB()); + + schemes["s4-fastpfor-d4"] = shared_ptr( + new CompositeCodec, leftovercodec>()); + schemes["s4-fastpfor-dm"] = shared_ptr( + new CompositeCodec, VariableByte>()); + schemes["s4-fastpfor-d1"] = shared_ptr( + new CompositeCodec, leftovercodec>()); + schemes["s4-fastpfor-d2"] = shared_ptr( + new CompositeCodec, leftovercodec>()); + + schemes["bp32"] = shared_ptr( + new CompositeCodec, + VariableByte>()); + schemes["ibp32"] = shared_ptr( + new CompositeCodec, + leftovercodec>()); + + schemes["s4-bp128-d1-ni"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-d2-ni"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-d4-ni"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-dm-ni"] = shared_ptr( + new 
CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + + schemes["s4-bp128-d1"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-d2"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-d4"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["s4-bp128-dm"] = shared_ptr( + new CompositeCodec< + SIMDBinaryPacking>, + leftovercodec>()); + schemes["for"] = shared_ptr(new ForCODEC()); + return schemes; } - - class CODECFactory { public: - static map> scodecmap; - static shared_ptr defaultptr; - - // hacked for convenience - static vector> allSchemes() { - vector > ans; - for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { - ans.push_back(i->second); - } - return ans; + static map> scodecmap; + static shared_ptr defaultptr; + + // hacked for convenience + static vector> allSchemes() { + vector> ans; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + ans.push_back(i->second); } + return ans; + } - static vector allNames() { - vector ans; - for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { - ans.push_back(i->first); - } - return ans; + static vector allNames() { + vector ans; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + ans.push_back(i->first); } - - /** - * This function tries to determine whether the - * input is modified during compression. - */ - static bool modifiesInputDuringCompression(IntegerCODEC &v) { - vector test; - const uint32_t N = 2049; - for (uint32_t k = 0; k < N; ++k) - test.emplace_back(k); - vector out(N + 1024); - size_t outsize = out.size(); - v.encodeArray(test.data(), N, out.data(), outsize); - for (uint32_t k = 0; k < N; ++k) - if (test[k] != k) return true; - return false; // granted this is not full-proof, but is ok in our context + return ans; + } + + /** + * This function tries to determine whether the + * input is modified during compression. 
+ */ + static bool modifiesInputDuringCompression(IntegerCODEC &v) { + vector test; + const uint32_t N = 2049; + for (uint32_t k = 0; k < N; ++k) + test.emplace_back(k); + vector out(N + 1024); + size_t outsize = out.size(); + v.encodeArray(test.data(), N, out.data(), outsize); + for (uint32_t k = 0; k < N; ++k) + if (test[k] != k) + return true; + return false; // granted this is not full-proof, but is ok in our context + } + + static string getName(IntegerCODEC &v) { + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + if (i->second.get() == &v) + return i->first; } - - static string getName(IntegerCODEC &v) { - for (auto i = scodecmap.begin(); i != scodecmap.end() ; ++i) { - if (i->second.get() == &v) - return i->first; - } - return "UNKNOWN"; + return "UNKNOWN"; + } + + static bool valid(string name) { + return (scodecmap.find(name) != scodecmap.end()); + } + + static shared_ptr &getFromName(string name) { + if (scodecmap.find(name) == scodecmap.end()) { + cerr << "name " << name << " does not refer to a CODEC." << endl; + cerr << "possible choices:" << endl; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + cerr << static_cast(i->first) + << endl; // useless cast, but just to be clear + } + return defaultptr; } - - static bool valid(string name) { - return (scodecmap.find(name) != scodecmap.end()) ; - } - - static shared_ptr &getFromName(string name) { - if (scodecmap.find(name) == scodecmap.end()) { - cerr << "name " << name << " does not refer to a CODEC." 
<< endl; - cerr << "possible choices:" << endl; - for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { - cerr << static_cast(i->first) << endl; // useless cast, but just to be clear - } - return defaultptr; - } - return scodecmap[name]; - } - + return scodecmap[name]; + } }; map> CODECFactory::scodecmap = - initializefactory(); + initializefactory(); -shared_ptr CODECFactory::defaultptr = shared_ptr(nullptr); +shared_ptr CODECFactory::defaultptr = + shared_ptr(nullptr); } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_CODECFACTORY_H_ */ diff --git a/include/codecs.h b/include/codecs.h index 881ef94..f90dddc 100644 --- a/include/codecs.h +++ b/include/codecs.h @@ -8,7 +8,6 @@ #ifndef SIMDCompressionAndIntersection_CODECS_H_ #define SIMDCompressionAndIntersection_CODECS_H_ - #include "common.h" #include "util.h" #include "bitpackinghelpers.h" @@ -17,124 +16,116 @@ namespace SIMDCompressionLib { using namespace std; -class NotEnoughStorage: public std::runtime_error { +class NotEnoughStorage : public std::runtime_error { public: - size_t required;// number of 32-bit symbols required - NotEnoughStorage(const size_t req) : - runtime_error(""), required(req) { - - } + size_t required; // number of 32-bit symbols required + NotEnoughStorage(const size_t req) : runtime_error(""), required(req) {} }; class IntegerCODEC { public: - - /** - * You specify input and input length, as well as - * output and output length. nvalue gets modified to - * reflect how much was used. If the new value of - * nvalue is more than the original value, we can - * consider this a buffer overrun. - * - * You are responsible for allocating the memory (length - * for *in and nvalue for *out). - */ - virtual void encodeArray(uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) = 0; - - /** - * Usage is similar to encodeArray except that it returns a pointer - * incremented from in. In theory it should be in+length. 
If the - * returned pointer is less than in+length, then this generally means - * that the decompression is not finished (some scheme compress - * the bulk of the data one way, and they then they compress remaining - * integers using another scheme). - * - * As with encodeArray, you need to have length elements allocated - * for *in and at least nvalue elements allocated for out. The value - * of the variable nvalue gets updated with the number actually used - * (if nvalue exceeds the original value, there might be a buffer - * overrun). - */ - virtual const uint32_t *decodeArray(const uint32_t *in, - const size_t length, uint32_t *out, size_t &nvalue) = 0; - virtual ~IntegerCODEC() { - } - - /** - * Will compress the content of a vector into - * another vector. - * - * This is offered for convenience. It might be slow. - */ - virtual vector compress(vector &data) { - vector compresseddata(data.size() * 2 + 1024);// allocate plenty of memory - size_t memavailable = compresseddata.size(); - encodeArray(data.data(), data.size(), compresseddata.data(), memavailable); - compresseddata.resize(memavailable); - return compresseddata; - } - - /** - * Will uncompress the content of a vector into - * another vector. Some CODECs know exactly how much data to uncompress, - * others need to uncompress it all to know how data there is to uncompress... - * So it useful to have a hint (expected_uncompressed_size) that tells how - * much data there will be to uncompress. Otherwise, the code will - * try to guess, but the result is uncertain and inefficient. You really - * ought to keep track of how many symbols you had compressed. - * - * For convenience. Might be slow. 
- */ - virtual vector uncompress( - vector &compresseddata, - size_t expected_uncompressed_size = 0) { - vector data(expected_uncompressed_size);// allocate plenty of memory - size_t memavailable = data.size(); - try { - decodeArray(compresseddata.data(), compresseddata.size(), data.data(), - memavailable); - } catch (NotEnoughStorage &nes) { - data.resize(nes.required + 1024); - decodeArray(compresseddata.data(), compresseddata.size(), data.data(), - memavailable); - - } - data.resize(memavailable); - return data; + /** + * You specify input and input length, as well as + * output and output length. nvalue gets modified to + * reflect how much was used. If the new value of + * nvalue is more than the original value, we can + * consider this a buffer overrun. + * + * You are responsible for allocating the memory (length + * for *in and nvalue for *out). + */ + virtual void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) = 0; + + /** + * Usage is similar to encodeArray except that it returns a pointer + * incremented from in. In theory it should be in+length. If the + * returned pointer is less than in+length, then this generally means + * that the decompression is not finished (some scheme compress + * the bulk of the data one way, and they then they compress remaining + * integers using another scheme). + * + * As with encodeArray, you need to have length elements allocated + * for *in and at least nvalue elements allocated for out. The value + * of the variable nvalue gets updated with the number actually used + * (if nvalue exceeds the original value, there might be a buffer + * overrun). + */ + virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) = 0; + virtual ~IntegerCODEC() {} + + /** + * Will compress the content of a vector into + * another vector. + * + * This is offered for convenience. It might be slow. 
+ */ + virtual vector compress(vector &data) { + vector compresseddata(data.size() * 2 + + 1024); // allocate plenty of memory + size_t memavailable = compresseddata.size(); + encodeArray(data.data(), data.size(), compresseddata.data(), memavailable); + compresseddata.resize(memavailable); + return compresseddata; + } + + /** + * Will uncompress the content of a vector into + * another vector. Some CODECs know exactly how much data to uncompress, + * others need to uncompress it all to know how data there is to uncompress... + * So it useful to have a hint (expected_uncompressed_size) that tells how + * much data there will be to uncompress. Otherwise, the code will + * try to guess, but the result is uncertain and inefficient. You really + * ought to keep track of how many symbols you had compressed. + * + * For convenience. Might be slow. + */ + virtual vector uncompress(vector &compresseddata, + size_t expected_uncompressed_size = 0) { + vector data( + expected_uncompressed_size); // allocate plenty of memory + size_t memavailable = data.size(); + try { + decodeArray(compresseddata.data(), compresseddata.size(), data.data(), + memavailable); + } catch (NotEnoughStorage &nes) { + data.resize(nes.required + 1024); + decodeArray(compresseddata.data(), compresseddata.size(), data.data(), + memavailable); } + data.resize(memavailable); + return data; + } - virtual string name() const = 0; + virtual string name() const = 0; }; /****************** * This just copies the data, no compression. 
*/ -class JustCopy: public IntegerCODEC { +class JustCopy : public IntegerCODEC { public: - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - memcpy(out, in, sizeof(uint32_t) * length); - nvalue = length; - } - // like encodeArray, but we don't actually copy - void fakeencodeArray(const uint32_t * /*in*/, const size_t length, - size_t &nvalue) { - nvalue = length; - } - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - memcpy(out, in, sizeof(uint32_t) * length); - nvalue = length; - return in + length; - } - string name() const { - return "JustCopy"; - } + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + } + // like encodeArray, but we don't actually copy + void fakeencodeArray(const uint32_t * /*in*/, const size_t length, + size_t &nvalue) { + nvalue = length; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + return in + length; + } + string name() const { return "JustCopy"; } }; - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_CODECS_H_ */ diff --git a/include/common.h b/include/common.h index 2aacedd..ffb22d3 100644 --- a/include/common.h +++ b/include/common.h @@ -7,7 +7,6 @@ #ifndef SIMDCompressionAndIntersection_COMMON_H_ #define SIMDCompressionAndIntersection_COMMON_H_ - #include #include #include @@ -49,16 +48,13 @@ #include "platform.h" #ifdef USE_ALIGNED -# define MM_LOAD_SI_128 _mm_load_si128 -# define MM_STORE_SI_128 _mm_store_si128 +#define MM_LOAD_SI_128 _mm_load_si128 +#define MM_STORE_SI_128 _mm_store_si128 #else -# define MM_LOAD_SI_128 _mm_loadu_si128 -# define MM_STORE_SI_128 _mm_storeu_si128 +#define MM_LOAD_SI_128 _mm_loadu_si128 +#define MM_STORE_SI_128 _mm_storeu_si128 #endif -namespace 
SIMDCompressionLib { - - -} // namespace SIMDCompressionLib +namespace SIMDCompressionLib {} // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_COMMON_H_ */ diff --git a/include/compositecodec.h b/include/compositecodec.h index 11e551e..c1e1577 100644 --- a/include/compositecodec.h +++ b/include/compositecodec.h @@ -17,54 +17,51 @@ namespace SIMDCompressionLib { * This is a useful class for CODEC that only compress * data having length a multiple of some unit length. */ -template -class CompositeCodec: public IntegerCODEC { +template +class CompositeCodec : public IntegerCODEC { public: - CompositeCodec() : - codec1(), codec2() { - } - Codec1 codec1; - Codec2 codec2; - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - const size_t roundedlength = length / Codec1::BlockSize - * Codec1::BlockSize; - size_t nvalue1 = nvalue; - codec1.encodeArray(in, roundedlength, out, nvalue1); + CompositeCodec() : codec1(), codec2() {} + Codec1 codec1; + Codec2 codec2; + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + const size_t roundedlength = length / Codec1::BlockSize * Codec1::BlockSize; + size_t nvalue1 = nvalue; + codec1.encodeArray(in, roundedlength, out, nvalue1); - if (roundedlength < length) { - ASSERT(nvalue >= nvalue1, nvalue << " " << nvalue1); - size_t nvalue2 = nvalue - nvalue1; - codec2.encodeArray(in + roundedlength, length - roundedlength, - out + nvalue1, nvalue2); - nvalue = nvalue1 + nvalue2; - } else { - nvalue = nvalue1; - } - } - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - const uint32_t *const initin(in); - size_t mynvalue1 = nvalue; - const uint32_t *in2 = codec1.decodeArray(in, length, out, mynvalue1); - if (length + in > in2) { - assert(nvalue > mynvalue1); - size_t nvalue2 = nvalue - mynvalue1; - const uint32_t *in3 = codec2.decodeArray(in2, length - (in2 - in), - out + mynvalue1, nvalue2); 
- nvalue = mynvalue1 + nvalue2; - assert(initin + length >= in3); - return in3; - } - nvalue = mynvalue1; - assert(initin + length >= in2); - return in2; + if (roundedlength < length) { + ASSERT(nvalue >= nvalue1, nvalue << " " << nvalue1); + size_t nvalue2 = nvalue - nvalue1; + codec2.encodeArray(in + roundedlength, length - roundedlength, + out + nvalue1, nvalue2); + nvalue = nvalue1 + nvalue2; + } else { + nvalue = nvalue1; } - string name() const { - ostringstream convert; - convert << codec1.name() << "+" << codec2.name(); - return convert.str(); + } + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + const uint32_t *const initin(in); + size_t mynvalue1 = nvalue; + const uint32_t *in2 = codec1.decodeArray(in, length, out, mynvalue1); + if (length + in > in2) { + assert(nvalue > mynvalue1); + size_t nvalue2 = nvalue - mynvalue1; + const uint32_t *in3 = codec2.decodeArray(in2, length - (in2 - in), + out + mynvalue1, nvalue2); + nvalue = mynvalue1 + nvalue2; + assert(initin + length >= in3); + return in3; } + nvalue = mynvalue1; + assert(initin + length >= in2); + return in2; + } + string name() const { + ostringstream convert; + convert << codec1.name() << "+" << codec2.name(); + return convert.str(); + } }; } // namespace SIMDCompressionLib diff --git a/include/delta.h b/include/delta.h index fa2fa05..4ca74c4 100644 --- a/include/delta.h +++ b/include/delta.h @@ -8,7 +8,6 @@ #ifndef SIMDCompressionAndIntersection_DELTA_H_ #define SIMDCompressionAndIntersection_DELTA_H_ - #include "common.h" namespace SIMDCompressionLib { @@ -18,70 +17,70 @@ namespace SIMDCompressionLib { * include any other header file. 
*/ - -template -void delta(const T initoffset, T *data, const size_t size) { - if (size == 0) return; // nothing to do - if (size > 1) - for (size_t i = size - 1; i > 0; --i) { - data[i] -= data[i - 1]; - } - data[0] -= initoffset; +template void delta(const T initoffset, T *data, const size_t size) { + if (size == 0) + return; // nothing to do + if (size > 1) + for (size_t i = size - 1; i > 0; --i) { + data[i] -= data[i - 1]; + } + data[0] -= initoffset; } -template -void delta(const T initoffset, T *data) { - if (size == 0) return; // nothing to do - if (size > 1) - for (size_t i = size - 1; i > 0; --i) { - data[i] -= data[i - 1]; - } - data[0] -= initoffset; +template void delta(const T initoffset, T *data) { + if (size == 0) + return; // nothing to do + if (size > 1) + for (size_t i = size - 1; i > 0; --i) { + data[i] -= data[i - 1]; + } + data[0] -= initoffset; } - -template +template void inverseDelta(const T initoffset, T *data, const size_t size) { - if (size == 0) return; // nothing to do - data[0] += initoffset; - const size_t UnrollQty = 4; - const size_t sz0 = (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty - size_t i = 1; - if (sz0 >= UnrollQty) { - T a = data[0]; - for (; i < sz0 - UnrollQty; i += UnrollQty) { - a = data[i] += a; - a = data[i + 1] += a; - a = data[i + 2] += a; - a = data[i + 3] += a; - } - } - for (; i != size; ++i) { - data[i] += data[i - 1]; + if (size == 0) + return; // nothing to do + data[0] += initoffset; + const size_t UnrollQty = 4; + const size_t sz0 = + (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty + size_t i = 1; + if (sz0 >= UnrollQty) { + T a = data[0]; + for (; i < sz0 - UnrollQty; i += UnrollQty) { + a = data[i] += a; + a = data[i + 1] += a; + a = data[i + 2] += a; + a = data[i + 3] += a; } + } + for (; i != size; ++i) { + data[i] += data[i - 1]; + } } -template -void inverseDelta(const T initoffset, T *data) { - if (size == 0) return; // nothing to do - data[0] += initoffset; - 
const size_t UnrollQty = 4; - const size_t sz0 = (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty - size_t i = 1; - if (sz0 >= UnrollQty) { - T a = data[0]; - for (; i < sz0 - UnrollQty; i += UnrollQty) { - a = data[i] += a; - a = data[i + 1] += a; - a = data[i + 2] += a; - a = data[i + 3] += a; - } - } - for (; i != size; ++i) { - data[i] += data[i - 1]; +template void inverseDelta(const T initoffset, T *data) { + if (size == 0) + return; // nothing to do + data[0] += initoffset; + const size_t UnrollQty = 4; + const size_t sz0 = + (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty + size_t i = 1; + if (sz0 >= UnrollQty) { + T a = data[0]; + for (; i < sz0 - UnrollQty; i += UnrollQty) { + a = data[i] += a; + a = data[i + 1] += a; + a = data[i + 2] += a; + a = data[i + 3] += a; } + } + for (; i != size; ++i) { + data[i] += data[i - 1]; + } } - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_DELTA_H_ */ diff --git a/include/deltatemplates.h b/include/deltatemplates.h index f48cb84..160c36d 100644 --- a/include/deltatemplates.h +++ b/include/deltatemplates.h @@ -18,150 +18,146 @@ namespace SIMDCompressionLib { */ /** - * The structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, CoarseDelta2SIMD, Max4DeltaSIMD - * are used in templates to specify which type of differential encoding to use (if any). + * The structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, CoarseDelta2SIMD, + * Max4DeltaSIMD + * are used in templates to specify which type of differential encoding to use + * (if any). 
* * See SIMDDeltaProcessor */ struct RegularDeltaSIMD { - // Folklore code, unknown origin of this idea - ALWAYS_INLINE - static __m128i PrefixSum(__m128i curr, __m128i prev) { - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); - const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); - return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); - } - - ALWAYS_INLINE - static __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); - } - - static bool usesDifferentialEncoding() { return true; } - - static std::string name() { return "Delta1"; } + // Folklore code, unknown origin of this idea + ALWAYS_INLINE + static __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); + return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); + } + + ALWAYS_INLINE + static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32( + curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); + } + + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta1"; } }; struct NoDelta { - ALWAYS_INLINE - static __m128i PrefixSum(__m128i curr, __m128i) { - return curr; - } - ALWAYS_INLINE - static __m128i Delta(__m128i curr, __m128i) { - return curr; - } + ALWAYS_INLINE + static __m128i PrefixSum(__m128i curr, __m128i) { return curr; } + ALWAYS_INLINE + static __m128i Delta(__m128i curr, __m128i) { return curr; } - static bool usesDifferentialEncoding() { return false; } - static std::string name() { return "NoDelta"; } + static bool usesDifferentialEncoding() { return false; } + static std::string name() { return "NoDelta"; } }; struct CoarseDelta4SIMD { - ALWAYS_INLINE - // Proposed and implemented by L. 
Boytosv - static __m128i PrefixSum(__m128i curr, __m128i prev) { - return _mm_add_epi32(curr, prev); - } - ALWAYS_INLINE - static __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, prev); - } - - static bool usesDifferentialEncoding() { return true; } - - static std::string name() { return "Delta4"; } + ALWAYS_INLINE + // Proposed and implemented by L. Boytosv + static __m128i PrefixSum(__m128i curr, __m128i prev) { + return _mm_add_epi32(curr, prev); + } + ALWAYS_INLINE + static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, prev); + } + + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta4"; } }; struct CoarseDelta2SIMD { - ALWAYS_INLINE - // Proposed and implemented by L. Boytosv - static __m128i PrefixSum(__m128i curr, __m128i prev) { - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); - return _mm_add_epi32(_tmp1, _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 2, 3, 2))); - } - ALWAYS_INLINE - static __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, _mm_or_si128(_mm_slli_si128(curr, 8), _mm_srli_si128(prev, 8))); - } - static bool usesDifferentialEncoding() { return true; } - - static std::string name() { return "Delta2"; } + ALWAYS_INLINE + // Proposed and implemented by L. Boytosv + static __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + return _mm_add_epi32(_tmp1, + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 2, 3, 2))); + } + ALWAYS_INLINE + static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32( + curr, _mm_or_si128(_mm_slli_si128(curr, 8), _mm_srli_si128(prev, 8))); + } + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta2"; } }; struct Max4DeltaSIMD { - ALWAYS_INLINE - // The idea is due to N. 
Kurz - static __m128i PrefixSum(__m128i curr, __m128i prev) { - return _mm_add_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); - } - ALWAYS_INLINE - static __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); - } - static std::string name() { return "DeltaM4"; } - - static bool usesDifferentialEncoding() { return true; } - + ALWAYS_INLINE + // The idea is due to N. Kurz + static __m128i PrefixSum(__m128i curr, __m128i prev) { + return _mm_add_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); + } + ALWAYS_INLINE + static __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); + } + static std::string name() { return "DeltaM4"; } + + static bool usesDifferentialEncoding() { return true; } }; - - /** - * Wrapper around the structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, CoarseDelta2SIMD, Max4DeltaSIMD + * Wrapper around the structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, + * CoarseDelta2SIMD, Max4DeltaSIMD * to compute differential encoding and prefix sums. */ -template -struct SIMDDeltaProcessor { - static __m128i runPrefixSum(__m128i initOffset, uint32_t *pData) { - const size_t QtyDivBy4 = TotalQty / 4; - // The block should contain 8N 32-bit integers, where N is some integer - assert(QtyDivBy4 % 2 == 0); - - __m128i *pCurr = reinterpret_cast<__m128i *>(pData); - const __m128i *pEnd = pCurr + QtyDivBy4; - - // Leonid Boytsov: manual loop unrolling may be crucial here. 
- while (pCurr < pEnd) { - initOffset = DeltaHelper::PrefixSum(MM_LOAD_SI_128(pCurr), initOffset); - MM_STORE_SI_128(pCurr++, initOffset); - - initOffset = DeltaHelper::PrefixSum(MM_LOAD_SI_128(pCurr), initOffset); - MM_STORE_SI_128(pCurr++, initOffset); - } - - return initOffset; +template struct SIMDDeltaProcessor { + static __m128i runPrefixSum(__m128i initOffset, uint32_t *pData) { + const size_t QtyDivBy4 = TotalQty / 4; + // The block should contain 8N 32-bit integers, where N is some integer + assert(QtyDivBy4 % 2 == 0); + + __m128i *pCurr = reinterpret_cast<__m128i *>(pData); + const __m128i *pEnd = pCurr + QtyDivBy4; + + // Leonid Boytsov: manual loop unrolling may be crucial here. + while (pCurr < pEnd) { + initOffset = DeltaHelper::PrefixSum(MM_LOAD_SI_128(pCurr), initOffset); + MM_STORE_SI_128(pCurr++, initOffset); + + initOffset = DeltaHelper::PrefixSum(MM_LOAD_SI_128(pCurr), initOffset); + MM_STORE_SI_128(pCurr++, initOffset); } - static void runDelta(__m128i initOffset, uint32_t *pData) { - const size_t QtyDivBy4 = TotalQty / 4; - // The block should contain 8N 32-bit integers, where N is some integer - assert(QtyDivBy4 && QtyDivBy4 % 2 == 0); - __m128i *pCurr = reinterpret_cast<__m128i *>(pData) + QtyDivBy4 - 1; - __m128i *pStart = reinterpret_cast<__m128i *>(pData); - __m128i a = MM_LOAD_SI_128(pCurr); - // Leonid Boytsov: manual loop unrolling may be crucial here. 
- while (pCurr > pStart + 1) { - __m128i b = MM_LOAD_SI_128(pCurr - 1); - MM_STORE_SI_128(pCurr, DeltaHelper::Delta(a, b)); - a = b; - --pCurr; - - b = MM_LOAD_SI_128(pCurr - 1); - MM_STORE_SI_128(pCurr, DeltaHelper::Delta(a, b)); - a = b; - --pCurr; - } - - __m128i b = MM_LOAD_SI_128(pStart); - MM_STORE_SI_128(pStart + 1, DeltaHelper::Delta(a, b)); - a = b; - - MM_STORE_SI_128(pStart , DeltaHelper::Delta(a, initOffset)); + return initOffset; + } + + static void runDelta(__m128i initOffset, uint32_t *pData) { + const size_t QtyDivBy4 = TotalQty / 4; + // The block should contain 8N 32-bit integers, where N is some integer + assert(QtyDivBy4 && QtyDivBy4 % 2 == 0); + __m128i *pCurr = reinterpret_cast<__m128i *>(pData) + QtyDivBy4 - 1; + __m128i *pStart = reinterpret_cast<__m128i *>(pData); + __m128i a = MM_LOAD_SI_128(pCurr); + // Leonid Boytsov: manual loop unrolling may be crucial here. + while (pCurr > pStart + 1) { + __m128i b = MM_LOAD_SI_128(pCurr - 1); + MM_STORE_SI_128(pCurr, DeltaHelper::Delta(a, b)); + a = b; + --pCurr; + + b = MM_LOAD_SI_128(pCurr - 1); + MM_STORE_SI_128(pCurr, DeltaHelper::Delta(a, b)); + a = b; + --pCurr; } -}; + __m128i b = MM_LOAD_SI_128(pStart); + MM_STORE_SI_128(pStart + 1, DeltaHelper::Delta(a, b)); + a = b; + MM_STORE_SI_128(pStart, DeltaHelper::Delta(a, initOffset)); + } +}; } // namespace SIMDCompressionLib diff --git a/include/fastpfor.h b/include/fastpfor.h index 16c0852..5223805 100644 --- a/include/fastpfor.h +++ b/include/fastpfor.h @@ -7,7 +7,6 @@ #ifndef SIMDCompressionAndIntersection_FASTPFOR_H_ #define SIMDCompressionAndIntersection_FASTPFOR_H_ - #include "common.h" #include "codecs.h" #include "bitpackinghelpers.h" @@ -16,370 +15,346 @@ namespace SIMDCompressionLib { - - class ScalarSortedBitPacker { public: - - enum {DEFAULTSIZE = 128}; - uint32_t buffer[32]; - - ScalarSortedBitPacker() { - for (uint32_t i = 0; i < 32; ++i) { - data[i] = new uint32_t[DEFAULTSIZE]; - memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); 
- actualsizes[i] = DEFAULTSIZE; - } - clear(); - } - - void reset() { - for (uint32_t i = 0; i < 32; ++i) { - delete[] data[i]; - data[i] = new uint32_t[DEFAULTSIZE]; - memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); - actualsizes[i] = DEFAULTSIZE; - } - clear(); + enum { DEFAULTSIZE = 128 }; + uint32_t buffer[32]; + + ScalarSortedBitPacker() { + for (uint32_t i = 0; i < 32; ++i) { + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; } - - ~ScalarSortedBitPacker() { - free(); + clear(); + } + + void reset() { + for (uint32_t i = 0; i < 32; ++i) { + delete[] data[i]; + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; } - void free() { - clear(); - for (uint32_t i = 0; i < 32; ++i) - if (data[i] != NULL) { - delete[] data[i]; - data[i] = NULL; - actualsizes[i] = 0; - } + clear(); + } + + ~ScalarSortedBitPacker() { free(); } + void free() { + clear(); + for (uint32_t i = 0; i < 32; ++i) + if (data[i] != NULL) { + delete[] data[i]; + data[i] = NULL; + actualsizes[i] = 0; + } + } + void directAppend(uint32_t i, uint32_t val) { data[i][sizes[i]++] = val; } + + const uint32_t *get(int i) { return data[i]; } + + void ensureCapacity(int i, uint32_t datatoadd) { + if (sizes[i] + datatoadd > actualsizes[i]) { + actualsizes[i] = (sizes[i] + datatoadd + 127) / 128 * 128 * 2; + uint32_t *tmp = new uint32_t[actualsizes[i]]; + for (uint32_t j = 0; j < sizes[i]; ++j) + tmp[j] = data[i][j]; + delete[] data[i]; + data[i] = tmp; } - void directAppend(uint32_t i, uint32_t val) { - data[i][sizes[i]++] = val; + } + + void clear() { + for (uint32_t i = 0; i < 32; ++i) + sizes[i] = 0; // memset "might" be faster. 
+ } + + uint32_t *write(uint32_t *out) { + uint32_t bitmap = 0; + for (uint32_t k = 1; k < 32; ++k) { + if (sizes[k] != 0) + bitmap |= (1U << k); } - - const uint32_t *get(int i) { - return data[i]; - } - - void ensureCapacity(int i, uint32_t datatoadd) { - if (sizes[i] + datatoadd > actualsizes[i]) { - actualsizes[i] = (sizes[i] + datatoadd + 127) / 128 * 128 * 2; - uint32_t *tmp = new uint32_t[actualsizes[i]]; - for (uint32_t j = 0; j < sizes[i]; ++j) - tmp[j] = data[i][j]; - delete[] data[i]; - data[i] = tmp; + *(out++) = bitmap; + + for (uint32_t k = 1; k < 32; ++k) { + if (sizes[k] != 0) { + *out = sizes[k]; + out++; + uint32_t j = 0; + for (; j < sizes[k]; j += 32) { + BitPackingHelpers::fastpackwithoutmask(&data[k][j], out, k + 1); + out += k + 1; } + out -= (j - sizes[k]) * (k + 1) / 32; + } } - - void clear() { - for (uint32_t i = 0; i < 32; ++i) - sizes[i] = 0;// memset "might" be faster. - } - - uint32_t *write(uint32_t *out) { - uint32_t bitmap = 0; - for (uint32_t k = 1; k < 32; ++k) { - if (sizes[k] != 0) - bitmap |= (1U << k); + return out; + } + + const uint32_t *read(const uint32_t *in) { + clear(); + const uint32_t bitmap = *(in++); + + for (uint32_t k = 1; k < 32; ++k) { + if ((bitmap & (1U << k)) != 0) { + sizes[k] = *in++; + if (actualsizes[k] < sizes[k]) { + delete[] data[k]; + actualsizes[k] = (sizes[k] + 31) / 32 * 32; + data[k] = new uint32_t[actualsizes[k]]; } - *(out++) = bitmap; - - for (uint32_t k = 1; k < 32; ++k) { - if (sizes[k] != 0) { - *out = sizes[k]; - out++; - uint32_t j = 0; - for (; j < sizes[k]; j += 32) { - BitPackingHelpers::fastpackwithoutmask(&data[k][j], out, - k + 1); - out += k + 1; - } - out -= ( j - sizes[k] ) * (k+1) / 32; - - } + uint32_t j = 0; + for (; j + 31 < sizes[k]; j += 32) { + BitPackingHelpers::fastunpack(in, &data[k][j], k + 1); + in += k + 1; } - return out; + uint32_t remaining = sizes[k] - j; + memcpy(buffer, in, (remaining * (k + 1) + 31) / 32 * sizeof(uint32_t)); + uint32_t *bpointer = buffer; + in 
+= ((sizes[k] + 31) / 32 * 32 - j) / 32 * (k + 1); + for (; j < sizes[k]; j += 32) { + BitPackingHelpers::fastunpack(bpointer, &data[k][j], k + 1); + bpointer += k + 1; + } + in -= (j - sizes[k]) * (k + 1) / 32; + } } + return in; + } - const uint32_t *read(const uint32_t *in) { - clear(); - const uint32_t bitmap = *(in++); - - for (uint32_t k = 1; k < 32; ++k) { - if ((bitmap & (1U << k)) != 0) { - sizes[k] = *in++; - if (actualsizes[k] < sizes[k]) { - delete[] data[k]; - actualsizes[k] = (sizes[k] + 31) / 32 * 32; - data[k] = new uint32_t[actualsizes[k]]; - } - uint32_t j = 0; - for (; j + 31 < sizes[k]; j += 32) { - BitPackingHelpers::fastunpack(in, &data[k][j], k + 1); - in += k + 1; - } - uint32_t remaining = sizes[k] - j; - memcpy(buffer,in,(remaining * (k + 1) + 31)/32*sizeof(uint32_t)); - uint32_t * bpointer = buffer; - in += ((sizes[k] + 31) / 32 * 32-j)/32 * (k+1); - for (; j < sizes[k]; j += 32) { - BitPackingHelpers::fastunpack(bpointer, &data[k][j], k + 1); - bpointer+= k + 1; - } - in -= ( j - sizes[k] ) * (k+1) / 32; - - } - } - return in; - - } private: - uint32_t *data[32]; - uint32_t sizes[32]; - uint32_t actualsizes[32]; - - // we don't want anyone to start copying this class - ScalarSortedBitPacker(const ScalarSortedBitPacker &); - ScalarSortedBitPacker &operator=(const ScalarSortedBitPacker &); - + uint32_t *data[32]; + uint32_t sizes[32]; + uint32_t actualsizes[32]; + // we don't want anyone to start copying this class + ScalarSortedBitPacker(const ScalarSortedBitPacker &); + ScalarSortedBitPacker &operator=(const ScalarSortedBitPacker &); }; - /** * FastPFor * * Reference and documentation: * - * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization + * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second + * through vectorization * http://arxiv.org/abs/1209.2137 * - * Designed by D. Lemire with ideas from Leonid Boytsov. This scheme is NOT patented. + * Designed by D. 
Lemire with ideas from Leonid Boytsov. This scheme is NOT + * patented. * */ -template//BlockSizeInUnitsOfPackSize should be 4 or 8 -class FastPFor: public IntegerCODEC { +template // BlockSizeInUnitsOfPackSize should be 4 or 8 +class FastPFor : public IntegerCODEC { public: - /** - * ps (page size) should be a multiple of BlockSize, any "large" - * value should do. - */ - FastPFor(uint32_t ps = 65536) : - PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), + /** + * ps (page size) should be a multiple of BlockSize, any "large" + * value should do. + */ + FastPFor(uint32_t ps = 65536) + : PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), bytescontainer(PageSize + 3 * PageSize / BlockSize) { - assert(ps / BlockSize * BlockSize == ps); - assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + assert(ps / BlockSize * BlockSize == ps); + assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + } + enum { + PACKSIZE = 32, + overheadofeachexcept = 8, + overheadduetobits = 8, + overheadduetonmbrexcept = 8, + BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE + }; + + const uint32_t PageSize; + const uint32_t bitsPageSize; + ScalarSortedBitPacker bpacker; + vector bytescontainer; + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + const uint32_t *const initin(in); + const size_t mynvalue = *in; + ++in; + if (mynvalue > nvalue) + throw NotEnoughStorage(mynvalue); + nvalue = mynvalue; + const uint32_t *const finalout(out + nvalue); + uint32_t prev = 0; + while (out != finalout) { + size_t thisnvalue(0); + size_t thissize = static_cast( + finalout > PageSize + out ? 
PageSize : (finalout - out)); + + __decodeArray(in, thisnvalue, out, thissize, prev); + in += thisnvalue; + out += thissize; } - enum { - PACKSIZE = 32, - overheadofeachexcept = 8, - overheadduetobits = 8, - overheadduetonmbrexcept = 8, - BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE - }; - - - - const uint32_t PageSize; - const uint32_t bitsPageSize; - ScalarSortedBitPacker bpacker; - vector bytescontainer; - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - const uint32_t *const initin(in); - const size_t mynvalue = *in; - ++in; - if (mynvalue > nvalue) - throw NotEnoughStorage(mynvalue); - nvalue = mynvalue; - const uint32_t *const finalout(out + nvalue); - uint32_t prev = 0; - while (out != finalout) { - size_t thisnvalue(0); - size_t thissize = - static_cast(finalout > PageSize + out ? PageSize - : (finalout - out)); - - __decodeArray(in, thisnvalue, out, thissize, prev); - in += thisnvalue; - out += thissize; - } - assert(initin + length >= in); - bpacker.reset();// if you don't do this, the codec has a "memory". - return in; + assert(initin + length >= in); + bpacker.reset(); // if you don't do this, the codec has a "memory". + return in; + } + + /** + * If you save the output and recover it in memory, you are + * responsible to ensure that the alignment is preserved. + * + * The input size (length) should be a multiple of + * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done + * to simplify slightly the implementation.) + */ + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t *const initout(out); + const uint32_t *const finalin(in + length); + + *out++ = static_cast(length); + const size_t oldnvalue = nvalue; + nvalue = 1; + uint32_t prev = 0; + while (in != finalin) { + size_t thissize = static_cast( + finalin > PageSize + in ? 
PageSize : (finalin - in)); + size_t thisnvalue(0); + __encodeArray(in, thissize, out, thisnvalue, prev); + nvalue += thisnvalue; + out += thisnvalue; + in += thissize; } - - /** - * If you save the output and recover it in memory, you are - * responsible to ensure that the alignment is preserved. - * - * The input size (length) should be a multiple of - * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done - * to simplify slightly the implementation.) - */ - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - checkifdivisibleby(length, BlockSize); - const uint32_t *const initout(out); - const uint32_t *const finalin(in + length); - - *out++ = static_cast(length); - const size_t oldnvalue = nvalue; - nvalue = 1; - uint32_t prev = 0; - while (in != finalin) { - size_t thissize = - static_cast(finalin > PageSize + in ? PageSize - : (finalin - in)); - size_t thisnvalue(0); - __encodeArray(in, thissize, out, thisnvalue, prev); - nvalue += thisnvalue; - out += thisnvalue; - in += thissize; - } - assert(out == nvalue + initout); - if (oldnvalue < nvalue) - cerr << "It is possible we have a buffer overrun. " << endl; - bpacker.reset();// if you don't do this, the buffer has a memory + assert(out == nvalue + initout); + if (oldnvalue < nvalue) + cerr << "It is possible we have a buffer overrun. 
" << endl; + bpacker.reset(); // if you don't do this, the buffer has a memory + } + + void getBestBFromData(const uint32_t *in, uint8_t &bestb, + uint8_t &bestcexcept, uint8_t &maxb) { + uint32_t freqs[33]; + for (uint32_t k = 0; k <= 32; ++k) + freqs[k] = 0; + for (uint32_t k = 0; k < BlockSize; ++k) { + freqs[asmbits(in[k])]++; } - - - void getBestBFromData(const uint32_t *in, uint8_t &bestb, - uint8_t &bestcexcept, uint8_t &maxb) { - uint32_t freqs[33]; - for (uint32_t k = 0; k <= 32; ++k) - freqs[k] = 0; - for (uint32_t k = 0; k < BlockSize; ++k) { - freqs[asmbits(in[k])]++; - } - bestb = 32; - while (freqs[bestb] == 0) - bestb--; - maxb = bestb; - uint32_t bestcost = bestb * BlockSize; - uint32_t cexcept = 0; + bestb = 32; + while (freqs[bestb] == 0) + bestb--; + maxb = bestb; + uint32_t bestcost = bestb * BlockSize; + uint32_t cexcept = 0; + bestcexcept = static_cast(cexcept); + for (uint32_t b = bestb - 1; b < 32; --b) { + cexcept += freqs[b + 1]; + uint32_t thiscost = cexcept * overheadofeachexcept + + cexcept * (maxb - b) + b * BlockSize + + 8; // the extra 8 is the cost of storing maxbits + if (bestb - b == 1) + thiscost -= cexcept; + if (thiscost < bestcost) { + bestcost = thiscost; + bestb = static_cast(b); bestcexcept = static_cast(cexcept); - for (uint32_t b = bestb - 1; b < 32; --b) { - cexcept += freqs[b + 1]; - uint32_t thiscost = cexcept * overheadofeachexcept + cexcept - * (maxb - b) + b * BlockSize + 8;// the extra 8 is the cost of storing maxbits - if(bestb - b == 1) thiscost -= cexcept; - if (thiscost < bestcost) { - bestcost = thiscost; - bestb = static_cast(b); - bestcexcept = static_cast(cexcept); - } - } + } } - - void __encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue, uint32_t &prev) { - uint32_t *const initout = out; // keep track of this - checkifdivisibleby(length, BlockSize); - uint32_t *const headerout = out++; // keep track of this - bpacker.clear(); - uint8_t *bc = bytescontainer.data(); - for (const 
uint32_t *const final = in + length; (in + BlockSize - <= final); in += BlockSize) { - uint8_t bestb, bestcexcept, maxb; - if (useDelta) { - uint32_t nextprev = in[BlockSize - 1]; - delta(prev, in, BlockSize); - prev = nextprev; - } - getBestBFromData(in, bestb, bestcexcept, maxb); - *bc++ = bestb; - *bc++ = bestcexcept; - if (bestcexcept > 0) { - *bc++ = maxb; - bpacker.ensureCapacity(maxb - bestb - 1, bestcexcept); - const uint32_t maxval = 1U << bestb; - for (uint32_t k = 0; k < BlockSize; ++k) { - if (in[k] >= maxval) { - bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); - *bc++ = static_cast(k); - } - } - } - for (size_t k = 0; k < BlockSizeInUnitsOfPackSize; ++k) { - BitPackingHelpers::fastpack(in + k * 32, out + k * bestb, bestb); - } - out += BlockSizeInUnitsOfPackSize * bestb; + } + + void __encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue, uint32_t &prev) { + uint32_t *const initout = out; // keep track of this + checkifdivisibleby(length, BlockSize); + uint32_t *const headerout = out++; // keep track of this + bpacker.clear(); + uint8_t *bc = bytescontainer.data(); + for (const uint32_t *const final = in + length; (in + BlockSize <= final); + in += BlockSize) { + uint8_t bestb, bestcexcept, maxb; + if (useDelta) { + uint32_t nextprev = in[BlockSize - 1]; + delta(prev, in, BlockSize); + prev = nextprev; + } + getBestBFromData(in, bestb, bestcexcept, maxb); + *bc++ = bestb; + *bc++ = bestcexcept; + if (bestcexcept > 0) { + *bc++ = maxb; + bpacker.ensureCapacity(maxb - bestb - 1, bestcexcept); + const uint32_t maxval = 1U << bestb; + for (uint32_t k = 0; k < BlockSize; ++k) { + if (in[k] >= maxval) { + bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); + *bc++ = static_cast(k); + } } - headerout[0] = static_cast(out - headerout); - const uint32_t bytescontainersize = static_cast(bc - bytescontainer.data()); - *(out++) = bytescontainersize; - memcpy(out, bytescontainer.data(), bytescontainersize); - out += 
(bytescontainersize + sizeof(uint32_t) - 1) - / sizeof(uint32_t); - const uint32_t *const lastout = bpacker.write(out); - nvalue = lastout - initout; + } + for (size_t k = 0; k < BlockSizeInUnitsOfPackSize; ++k) { + BitPackingHelpers::fastpack(in + k * 32, out + k * bestb, bestb); + } + out += BlockSizeInUnitsOfPackSize * bestb; } - - void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, - const size_t nvalue, uint32_t &prev) { - const uint32_t *const initin = in; - const uint32_t *const headerin = in++; - const uint32_t wheremeta = headerin[0]; - const uint32_t *inexcept = headerin + wheremeta; - const uint32_t bytesize = *inexcept++; - const uint8_t *bytep = reinterpret_cast(inexcept); - - inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); - inexcept = bpacker.read(inexcept); - length = inexcept - initin; - const uint32_t *unpackpointers[32 + 1]; - for (uint32_t k = 1; k <= 32; ++k) { - unpackpointers[k] = bpacker.get(k - 1); - } - for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out - += BlockSize) { - const uint8_t b = *bytep++; - const uint8_t cexcept = *bytep++; - for (size_t k = 0; k < BlockSizeInUnitsOfPackSize; ++k) { - BitPackingHelpers::fastunpack(in + k * b, out + k * 32, b); - } - in += BlockSizeInUnitsOfPackSize * b; - if (cexcept > 0) { - const uint8_t maxbits = *bytep++; - if (maxbits - b == 1) { - for (uint32_t k = 0; k < cexcept; ++k) { - const uint8_t pos = *(bytep++); - out[pos] |= static_cast(1) << b; - } - } else { - const uint32_t *vals = unpackpointers[maxbits - b]; - unpackpointers[maxbits - b] += cexcept; - for (uint32_t k = 0; k < cexcept; ++k) { - const uint8_t pos = *(bytep++); - out[pos] |= vals[k] << b; - } - } - } - if (useDelta) { - inverseDelta(prev, out, BlockSize); - prev = out[BlockSize - 1]; - } + headerout[0] = static_cast(out - headerout); + const uint32_t bytescontainersize = + static_cast(bc - bytescontainer.data()); + *(out++) = bytescontainersize; + memcpy(out, bytescontainer.data(), 
bytescontainersize); + out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + const uint32_t *const lastout = bpacker.write(out); + nvalue = lastout - initout; + } + + void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, + const size_t nvalue, uint32_t &prev) { + const uint32_t *const initin = in; + const uint32_t *const headerin = in++; + const uint32_t wheremeta = headerin[0]; + const uint32_t *inexcept = headerin + wheremeta; + const uint32_t bytesize = *inexcept++; + const uint8_t *bytep = reinterpret_cast(inexcept); + + inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + inexcept = bpacker.read(inexcept); + length = inexcept - initin; + const uint32_t *unpackpointers[32 + 1]; + for (uint32_t k = 1; k <= 32; ++k) { + unpackpointers[k] = bpacker.get(k - 1); + } + for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out += BlockSize) { + const uint8_t b = *bytep++; + const uint8_t cexcept = *bytep++; + for (size_t k = 0; k < BlockSizeInUnitsOfPackSize; ++k) { + BitPackingHelpers::fastunpack(in + k * b, out + k * 32, b); + } + in += BlockSizeInUnitsOfPackSize * b; + if (cexcept > 0) { + const uint8_t maxbits = *bytep++; + if (maxbits - b == 1) { + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= static_cast(1) << b; + } + } else { + const uint32_t *vals = unpackpointers[maxbits - b]; + unpackpointers[maxbits - b] += cexcept; + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= vals[k] << b; + } } - - assert(in == headerin + wheremeta); + } + if (useDelta) { + inverseDelta(prev, out, BlockSize); + prev = out[BlockSize - 1]; + } } - string name() const { - return string("FastPFor") + (useDelta ? "Delta" : ""); - } + assert(in == headerin + wheremeta); + } + string name() const { return string("FastPFor") + (useDelta ? 
"Delta" : ""); } }; - - - - - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_FASTPFOR_H_ */ diff --git a/include/for.h b/include/for.h index b26ec0d..cc989f6 100644 --- a/include/for.h +++ b/include/for.h @@ -1,293 +1,285 @@ -/* - * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * A fast implementation for Frame of Reference encoding. - * - * See the README.md file for more information, example code and references. - * - * Feel free to send comments/questions to chris@crupp.de. I am available - * for consulting. - */ - -#ifndef FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e -#define FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Returns the size required to compress a sequence of |length| ints, - * each compressed with |bits| bits - * - * This function will NOT include any overhead required by - * for_compress_sorted() and for_compress_unsorted(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_compressed_size_bits(uint32_t length, uint32_t bits); - -/** - * Returns the size required to compress an unsorted sequence of |length| ints. - * - * This routine scans |in| for the min/max values and then calls - * for_compressed_size_bits(). - * - * The returned size will include the overhead required for - * for_compress_sorted() and for_compressed_unsorted(). 
- */ -extern uint32_t -for_compressed_size_unsorted(const uint32_t *in, uint32_t length); - -/** - * Returns the size required to compress a sorted sequence of |length| ints. - * - * This routine extracts min/max values at the beginning and end of - * the sequence, then calls for_compressed_size_bits(). It is therefore - * slightly faster than for_compressed_size_unsorted(). - * - * The returned size will include the overhead required for - * for_compress_sorted() and for_compressed_unsorted(). - */ -extern uint32_t -for_compressed_size_sorted(const uint32_t *in, uint32_t length); - -/** - * Compresses a sequence of |length| ints at |in| and stores the result - * in |out|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * |bits| are the bits required to store a single integer. - * - * Returns the number of bytes used for compression. - * - * This is for advanced users who opt for storing |base| and |bits| on their - * own. This function is called by for_compress_sorted() and - * for_compress_unsorted(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, - uint32_t base, uint32_t bits); - -/** - * Compresses an unsorted sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This routine scans |in| for the min/max values and then calls - * for_compress_bits(). - * - * The minimun value and the bits are stored as metadata in |out|. - */ -extern uint32_t -for_compress_unsorted(const uint32_t *in, uint8_t *out, uint32_t length); - -/** - * Compresses a sorted sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This routine extracts min/max values at the beginning and end of - * the sequence, then calls for_compress_bits(). - * - * The minimun value and the bits are stored as metadata in |out|. 
- */ -extern uint32_t -for_compress_sorted(const uint32_t *in, uint8_t *out, uint32_t length); - -/** - * Uncompresses a sequence of |length| ints at |in| and stores the - * result in |out|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * |bits| are the bits required to store a single integer. - * - * Returns the number of compressed bytes processed. - * - * This function is for advanced users. It is the counterpart of - * for_compress_bits(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, - uint32_t base, uint32_t bits); - -/** - * Uncompresses a sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This function is a convenience wrapper for for_uncompress_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - * - * Returns the number of compressed bytes processed. - */ -extern uint32_t -for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length); - -/** - * Appends a |value| to a compressed sequence of unsorted integers. - * - * This function is optimized for appending new values at the end of an - * encoded sequence. This is only possible if the new value (more precisely: - * the delta of the new value) can be stored in the same amount of bits that - * were used to encode the other integers. - * - * If this is not the case then memory is allocated, the whole sequence is - * decoded and re-encoded using more bits. This requires a heap allocation - * with malloc(). - * - * Returns the size (in bytes) of the compressed data, or 0 if malloc() fails. - */ -extern uint32_t -for_append_unsorted(uint8_t *in, uint32_t length, uint32_t value); - -/** - * Appends a |value| to a compressed sequence of sorted integers. 
- * - * This function is optimized for appending new values at the end of an - * encoded sequence. This is only possible if the new value (more precisely: - * the delta of the new value) can be stored in the same amount of bits that - * were used to encode the other integers. - * - * If this is not the case then memory is allocated, the whole sequence is - * decoded and re-encoded using more bits. This requires a heap allocation - * with malloc(). - * - * Returns the size (in bytes) of the compressed data, or 0 if malloc() fails. - */ -extern uint32_t -for_append_sorted(uint8_t *in, uint32_t length, uint32_t value); - -/** - * Appends a |value| to a compressed integer sequence. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * |bits| are the bits required to store a single integer. - * - * Returns the size (in bytes) of the compressed data. - * - * Invariant: bits <= 32 - * Invariant: the new |value| (more precisely: |value - base|) can be stored - * in |bits| bits. Details can be found in the implementation of - * for_append() in for.c. - */ -extern uint32_t -for_append_bits(uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value); - -/** - * Returns the value at the given |index| from a compressed sequence. - * - * Make sure that |index| does not exceed the length of the sequence. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, - uint32_t index); - -/** - * Returns the value at the given |index| from a compressed sequence. - * - * Make sure that |index| does not exceed the length of the sequence. - * - * This function is a convenience wrapper for for_select_bits(). It - * expects metadata at the beginning of |in|. 
Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - */ -extern uint32_t -for_select(const uint8_t *in, uint32_t index); - -/** - * Performs a linear search for |value|. - * - * Returns the index of the found element, or |length| if the key was not - * found. - * - * This function is a convenience wrapper for for_linear_search_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - */ -extern uint32_t -for_linear_search(const uint8_t *in, uint32_t length, uint32_t value); - -/** - * Performs a linear search for |value|. - * - * Returns the index of the found element, or |length| if the key was not - * found. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_linear_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value); - -/** - * Performs lower bound binary search search for |value|. - * - * A lower bound search returns the first element in the sequence which does - * not compare less than |value|. - * The actual result is stored in |*actual|. - * - * This function is a convenience wrapper for for_lower_bound_search_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - */ -extern uint32_t -for_lower_bound_search(const uint8_t *in, uint32_t length, uint32_t value, - uint32_t *actual); - -/** - * Performs lower bound binary search search for |value|. - * - * A lower bound search returns the first element in the sequence which does - * not compare less than |value|. - * The actual result is stored in |*actual|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. 
- * - * Invariant: bits <= 32 - */ -extern uint32_t -for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value, uint32_t *actual); - - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e */ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A fast implementation for Frame of Reference encoding. + * + * See the README.md file for more information, example code and references. + * + * Feel free to send comments/questions to chris@crupp.de. I am available + * for consulting. + */ + +#ifndef FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e +#define FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Returns the size required to compress a sequence of |length| ints, + * each compressed with |bits| bits + * + * This function will NOT include any overhead required by + * for_compress_sorted() and for_compress_unsorted(). + * + * Invariant: bits <= 32 + */ +extern uint32_t for_compressed_size_bits(uint32_t length, uint32_t bits); + +/** + * Returns the size required to compress an unsorted sequence of |length| ints. + * + * This routine scans |in| for the min/max values and then calls + * for_compressed_size_bits(). 
+ * + * The returned size will include the overhead required for + * for_compress_sorted() and for_compressed_unsorted(). + */ +extern uint32_t for_compressed_size_unsorted(const uint32_t *in, + uint32_t length); + +/** + * Returns the size required to compress a sorted sequence of |length| ints. + * + * This routine extracts min/max values at the beginning and end of + * the sequence, then calls for_compressed_size_bits(). It is therefore + * slightly faster than for_compressed_size_unsorted(). + * + * The returned size will include the overhead required for + * for_compress_sorted() and for_compressed_unsorted(). + */ +extern uint32_t for_compressed_size_sorted(const uint32_t *in, uint32_t length); + +/** + * Compresses a sequence of |length| ints at |in| and stores the result + * in |out|. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * |bits| are the bits required to store a single integer. + * + * Returns the number of bytes used for compression. + * + * This is for advanced users who opt for storing |base| and |bits| on their + * own. This function is called by for_compress_sorted() and + * for_compress_unsorted(). + * + * Invariant: bits <= 32 + */ +extern uint32_t for_compress_bits(const uint32_t *in, uint8_t *out, + uint32_t length, uint32_t base, + uint32_t bits); + +/** + * Compresses an unsorted sequence of |length| ints at |in| and stores the + * result in |out|. + * + * This routine scans |in| for the min/max values and then calls + * for_compress_bits(). + * + * The minimun value and the bits are stored as metadata in |out|. + */ +extern uint32_t for_compress_unsorted(const uint32_t *in, uint8_t *out, + uint32_t length); + +/** + * Compresses a sorted sequence of |length| ints at |in| and stores the + * result in |out|. + * + * This routine extracts min/max values at the beginning and end of + * the sequence, then calls for_compress_bits(). 
+ * + * The minimun value and the bits are stored as metadata in |out|. + */ +extern uint32_t for_compress_sorted(const uint32_t *in, uint8_t *out, + uint32_t length); + +/** + * Uncompresses a sequence of |length| ints at |in| and stores the + * result in |out|. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * |bits| are the bits required to store a single integer. + * + * Returns the number of compressed bytes processed. + * + * This function is for advanced users. It is the counterpart of + * for_compress_bits(). + * + * Invariant: bits <= 32 + */ +extern uint32_t for_uncompress_bits(const uint8_t *in, uint32_t *out, + uint32_t length, uint32_t base, + uint32_t bits); + +/** + * Uncompresses a sequence of |length| ints at |in| and stores the + * result in |out|. + * + * This function is a convenience wrapper for for_uncompress_bits(). It + * expects metadata at the beginning of |in|. Use in combination with + * for_compress_sorted() and for_compress_unsorted(). + * + * Returns the number of compressed bytes processed. + */ +extern uint32_t for_uncompress(const uint8_t *in, uint32_t *out, + uint32_t length); + +/** + * Appends a |value| to a compressed sequence of unsorted integers. + * + * This function is optimized for appending new values at the end of an + * encoded sequence. This is only possible if the new value (more precisely: + * the delta of the new value) can be stored in the same amount of bits that + * were used to encode the other integers. + * + * If this is not the case then memory is allocated, the whole sequence is + * decoded and re-encoded using more bits. This requires a heap allocation + * with malloc(). + * + * Returns the size (in bytes) of the compressed data, or 0 if malloc() fails. 
+ */ +extern uint32_t for_append_unsorted(uint8_t *in, uint32_t length, + uint32_t value); + +/** + * Appends a |value| to a compressed sequence of sorted integers. + * + * This function is optimized for appending new values at the end of an + * encoded sequence. This is only possible if the new value (more precisely: + * the delta of the new value) can be stored in the same amount of bits that + * were used to encode the other integers. + * + * If this is not the case then memory is allocated, the whole sequence is + * decoded and re-encoded using more bits. This requires a heap allocation + * with malloc(). + * + * Returns the size (in bytes) of the compressed data, or 0 if malloc() fails. + */ +extern uint32_t for_append_sorted(uint8_t *in, uint32_t length, uint32_t value); + +/** + * Appends a |value| to a compressed integer sequence. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * |bits| are the bits required to store a single integer. + * + * Returns the size (in bytes) of the compressed data. + * + * Invariant: bits <= 32 + * Invariant: the new |value| (more precisely: |value - base|) can be stored + * in |bits| bits. Details can be found in the implementation of + * for_append() in for.c. + */ +extern uint32_t for_append_bits(uint8_t *in, uint32_t length, uint32_t base, + uint32_t bits, uint32_t value); + +/** + * Returns the value at the given |index| from a compressed sequence. + * + * Make sure that |index| does not exceed the length of the sequence. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * Invariant: bits <= 32 + */ +extern uint32_t for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, + uint32_t index); + +/** + * Returns the value at the given |index| from a compressed sequence. 
+ * + * Make sure that |index| does not exceed the length of the sequence. + * + * This function is a convenience wrapper for for_select_bits(). It + * expects metadata at the beginning of |in|. Use in combination with + * for_compress_sorted() and for_compress_unsorted(). + */ +extern uint32_t for_select(const uint8_t *in, uint32_t index); + +/** + * Performs a linear search for |value|. + * + * Returns the index of the found element, or |length| if the key was not + * found. + * + * This function is a convenience wrapper for for_linear_search_bits(). It + * expects metadata at the beginning of |in|. Use in combination with + * for_compress_sorted() and for_compress_unsorted(). + */ +extern uint32_t for_linear_search(const uint8_t *in, uint32_t length, + uint32_t value); + +/** + * Performs a linear search for |value|. + * + * Returns the index of the found element, or |length| if the key was not + * found. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * Invariant: bits <= 32 + */ +extern uint32_t for_linear_search_bits(const uint8_t *in, uint32_t length, + uint32_t base, uint32_t bits, + uint32_t value); + +/** + * Performs lower bound binary search search for |value|. + * + * A lower bound search returns the first element in the sequence which does + * not compare less than |value|. + * The actual result is stored in |*actual|. + * + * This function is a convenience wrapper for for_lower_bound_search_bits(). It + * expects metadata at the beginning of |in|. Use in combination with + * for_compress_sorted() and for_compress_unsorted(). + */ +extern uint32_t for_lower_bound_search(const uint8_t *in, uint32_t length, + uint32_t value, uint32_t *actual); + +/** + * Performs lower bound binary search search for |value|. + * + * A lower bound search returns the first element in the sequence which does + * not compare less than |value|. 
+ * The actual result is stored in |*actual|. + * + * |base| is the "offset" (or common delta value) of all ints. It is usually + * set to the minimum value of the uncompressed sequence. + * + * Invariant: bits <= 32 + */ +extern uint32_t for_lower_bound_search_bits(const uint8_t *in, uint32_t length, + uint32_t base, uint32_t bits, + uint32_t value, uint32_t *actual); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e */ diff --git a/include/forcodec.h b/include/forcodec.h index e7374b4..6c53aeb 100644 --- a/include/forcodec.h +++ b/include/forcodec.h @@ -23,7 +23,6 @@ #ifndef INCLUDE_FOR_H #define INCLUDE_FOR_H - #include "common.h" #include "codecs.h" #include "util.h" @@ -35,56 +34,57 @@ namespace SIMDCompressionLib { * Optimized scalar implementation of FOR (frame-of-reference) compression */ class ForCODEC : public IntegerCODEC { - public: - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - uint32_t cappedLength = static_cast(std::min(length, std::numeric_limits::max())); - *(uint32_t *)out = cappedLength; - out++; - // for_compress_sorted() would be a bit faster, but requires - // sorted input - nvalue = (4 + for_compress_unsorted((const uint32_t *)in, (uint8_t *)out, - cappedLength) + 3) / 4; - } - - const uint32_t *decodeArray(const uint32_t *in, const size_t, - uint32_t *out, size_t &nvalue) { - nvalue = *in; - in++; - return in + for_uncompress((const uint8_t *)in, out, static_cast(nvalue)); - } +public: + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint32_t cappedLength = static_cast( + std::min(length, std::numeric_limits::max())); + *(uint32_t *)out = cappedLength; + out++; + // for_compress_sorted() would be a bit faster, but requires + // sorted input + nvalue = (4 + for_compress_unsorted((const uint32_t *)in, (uint8_t *)out, + cappedLength) + + 3) / + 4; + } - // append a key. 
- // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t bytesize, uint32_t /*previous_key*/, - uint32_t key) { - return append(in, bytesize, key); - } + const uint32_t *decodeArray(const uint32_t *in, const size_t, uint32_t *out, + size_t &nvalue) { + nvalue = *in; + in++; + return in + for_uncompress((const uint8_t *)in, out, + static_cast(nvalue)); + } + // append a key. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t bytesize, + uint32_t /*previous_key*/, uint32_t key) { + return append(in, bytesize, key); + } - size_t append(uint8_t *in, const size_t /* unused */, uint32_t value) { - uint32_t length = *(uint32_t *)in; - size_t s = for_append_unsorted(in + 4, length, value); - *(uint32_t *)in = length + 1; - return s + 4; - } + size_t append(uint8_t *in, const size_t /* unused */, uint32_t value) { + uint32_t length = *(uint32_t *)in; + size_t s = for_append_unsorted(in + 4, length, value); + *(uint32_t *)in = length + 1; + return s + 4; + } - size_t findLowerBound(const uint32_t *in, const size_t, - uint32_t key, uint32_t *presult) { - uint32_t length = *in; - in++; - return (size_t)for_lower_bound_search((const uint8_t *)in, length, - key, presult); - } + size_t findLowerBound(const uint32_t *in, const size_t, uint32_t key, + uint32_t *presult) { + uint32_t length = *in; + in++; + return (size_t)for_lower_bound_search((const uint8_t *)in, length, key, + presult); + } - uint32_t select(const uint32_t *in, size_t index) { - in++; // Skip length - return for_select((const uint8_t *)in, static_cast(index)); - } + uint32_t select(const uint32_t *in, size_t index) { + in++; // Skip length + return for_select((const uint8_t *)in, static_cast(index)); + } - string name() const { - return "For"; - } + string name() const { return "For"; } }; } /* namespace */ diff --git a/include/frameofreference.h b/include/frameofreference.h index 
c2b880c..8bf35c3 100644 --- a/include/frameofreference.h +++ b/include/frameofreference.h @@ -9,7 +9,6 @@ namespace SIMDCompressionLib { - /** * Simple implementation of FOR compression using blocks of * 32 integers. @@ -20,60 +19,56 @@ namespace SIMDCompressionLib { * FOR does not compress particularly well but it supports * fast random access. */ -class FrameOfReference: public IntegerCODEC { +class FrameOfReference : public IntegerCODEC { public: + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + *out = static_cast( + std::min(length, std::numeric_limits::max())); + uint32_t *finalout = compress_length(in, *out, out + 1); + nvalue = finalout - out; + } + + const uint32_t *uncompress_length(const uint32_t *in, uint32_t *out, + uint32_t nvalue); + uint32_t *compress_length(const uint32_t *in, uint32_t length, uint32_t *out); + + const uint32_t *decodeArray(const uint32_t *in, const size_t, uint32_t *out, + size_t &nvalue) { + nvalue = *in; + in++; + return uncompress_length(in, out, static_cast(nvalue)); + } + + // appends the value "value" at the end of the compressed stream. Assumes that + // we have + // the space to do so. + // returns the next (total) size of the compressed output in bytes + // the "currentcompressedsizeinbytes" should be zero when no data has been + // compressed yet + size_t append(uint8_t *inbyte, const size_t currentcompressedsizeinbytes, + uint32_t value); + + // append a key. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t bytesize, + uint32_t /*previous_key*/, uint32_t key) { + return append(in, bytesize, key); + } + + // Performs a lower bound find in the encoded array. 
+ // Returns the index + size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, + uint32_t *presult); + + // Returns a decompressed value in an encoded array + uint32_t select(const uint32_t *in, size_t index); + + string name() const { return "FrameOfReference"; } - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - *out = static_cast(std::min(length, std::numeric_limits::max())); - uint32_t * finalout = compress_length(in, *out, out + 1); - nvalue = finalout - out; - } - - const uint32_t * uncompress_length(const uint32_t * in, uint32_t * out, uint32_t nvalue); - uint32_t * compress_length(const uint32_t * in, uint32_t length, uint32_t * out); - - const uint32_t *decodeArray(const uint32_t *in, const size_t , - uint32_t *out, size_t &nvalue) { - nvalue = *in; - in++; - return uncompress_length(in,out, static_cast(nvalue)); - } - - // appends the value "value" at the end of the compressed stream. Assumes that we have - // the space to do so. - // returns the next (total) size of the compressed output in bytes - // the "currentcompressedsizeinbytes" should be zero when no data has been compressed yet - size_t append(uint8_t *inbyte, const size_t currentcompressedsizeinbytes, uint32_t value); - - // append a key. - // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t bytesize, uint32_t /*previous_key*/, - uint32_t key) { - return append(in, bytesize, key); - } - - - // Performs a lower bound find in the encoded array. - // Returns the index - size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, - uint32_t *presult); - - - - - // Returns a decompressed value in an encoded array - uint32_t select(const uint32_t *in, size_t index); - - - string name() const { - return "FrameOfReference"; - } private: - }; - /** * Accelerated implementation of FOR compression using blocks of * 128 integers. . 
@@ -82,63 +77,58 @@ class FrameOfReference: public IntegerCODEC { * FOR does not compress particularly well but it supports * fast random access. */ -class SIMDFrameOfReference: public IntegerCODEC { +class SIMDFrameOfReference : public IntegerCODEC { public: + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + *out = static_cast( + std::min(length, std::numeric_limits::max())); + uint32_t *finalout = simd_compress_length(in, *out, out + 1); + nvalue = finalout - out; + } + + const uint32_t *simd_uncompress_length(const uint32_t *in, uint32_t *out, + uint32_t nvalue); + uint32_t *simd_compress_length(const uint32_t *in, uint32_t length, + uint32_t *out); + uint32_t *simd_compress_length_sorted(const uint32_t *in, uint32_t length, + uint32_t *out); + + const uint32_t *decodeArray(const uint32_t *in, const size_t, uint32_t *out, + size_t &nvalue) { + nvalue = *in; + in++; + return simd_uncompress_length(in, out, static_cast(nvalue)); + } + + // appends the value "value" at the end of the compressed stream. Assumes that + // we have + // the space to do so. + // returns the next (total) size of the compressed output in bytes + // the "currentcompressedsizeinbytes" should be zero when no data has been + // compressed yet + size_t append(uint8_t *inbyte, const size_t currentcompressedsizeinbytes, + uint32_t value); + + // append a key. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t bytesize, + uint32_t /* previous_key*/, uint32_t key) { + return append(in, bytesize, key); + } + + // Performs a lower bound find in the encoded array. 
+ // Returns the index + size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, + uint32_t *presult); + + // Returns a decompressed value in an encoded array + uint32_t select(const uint32_t *in, size_t index); + + string name() const { return "SIMDFrameOfReference"; } - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - *out = static_cast(std::min(length, std::numeric_limits::max())); - uint32_t * finalout = simd_compress_length(in, *out, out + 1); - nvalue = finalout - out; - - } - - const uint32_t * simd_uncompress_length(const uint32_t * in, uint32_t * out, uint32_t nvalue); - uint32_t * simd_compress_length(const uint32_t * in, uint32_t length, uint32_t * out); - uint32_t * simd_compress_length_sorted(const uint32_t * in, uint32_t length, uint32_t * out); - - const uint32_t *decodeArray(const uint32_t *in, const size_t , - uint32_t *out, size_t &nvalue) { - nvalue = *in; - in++; - return simd_uncompress_length(in, out, static_cast(nvalue)); - } - - // appends the value "value" at the end of the compressed stream. Assumes that we have - // the space to do so. - // returns the next (total) size of the compressed output in bytes - // the "currentcompressedsizeinbytes" should be zero when no data has been compressed yet - size_t append(uint8_t *inbyte, const size_t currentcompressedsizeinbytes, uint32_t value); - - // append a key. - // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t bytesize, uint32_t /* previous_key*/, - uint32_t key) { - return append(in, bytesize, key); - } - - - - // Performs a lower bound find in the encoded array. 
- // Returns the index - size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, - uint32_t *presult); - - - - - // Returns a decompressed value in an encoded array - uint32_t select(const uint32_t *in, size_t index); - - - string name() const { - return "SIMDFrameOfReference"; - } private: - }; - } - #endif /* INCLUDE_FRAMEOFREFERENCE_H_ */ diff --git a/include/hybm2.h b/include/hybm2.h index 0ddafaa..c01fb78 100644 --- a/include/hybm2.h +++ b/include/hybm2.h @@ -19,624 +19,623 @@ namespace SIMDCompressionLib { - class HybM2 { public: - - // th = 0 means that we select bitmaps as needed - HybM2(IntegerCODEC &c, intersectionfunction inter, uint32_t MaxId, - vector &recovbuffer, - uint32_t th = 32) : - bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), - threshold(th), recovbuffer(recovbuffer), codec(c), - Inter(inter) { + // th = 0 means that we select bitmaps as needed + HybM2(IntegerCODEC &c, intersectionfunction inter, uint32_t MaxId, + vector &recovbuffer, uint32_t th = 32) + : bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), recovbuffer(recovbuffer), codec(c), Inter(inter) {} + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). + */ + size_t getSizeInInts(uint32_t postId) { return mapuncompsizes[postId]; } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * into the data structure. The data will be either converted to a bitmap or + * compressed. 
+ */ + size_t load(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (threshold == 0) + return loadOptimized(postid, data, length); + else if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) || + (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. + */ + size_t computeUnpackVolume(const vector &ids) { + size_t answer = 0; + for (uint32_t id : ids) { + answer += mapuncompsizes[id]; } - - /** - * Returns the *uncompressed* size of a given posting list (specified - * by ID). - */ - size_t getSizeInInts(uint32_t postId) { - return mapuncompsizes[postId]; + return answer; + } + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). + */ + size_t intersect(const vector &ids, uint32_t *out, + size_t &sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; } - - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * into the data structure. The data will be either converted to a bitmap or compressed. 
- */ - size_t load(const uint32_t postid, const uint32_t *data, - const uint32_t length) { - if (threshold == 0) - return loadOptimized(postid, data, length); - else if (length * threshold >= mMaxId) - return loadAsBitmap(postid, data, length); - else - return loadAsShortArray(postid, data, length); + vector>>> shortlists; + vector>> bitmaps; + // vector bitmapscard; + + for (uint32_t id : ids) { + if (shortlistmap.find(id) != shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); + else { + assert(bitmapmap.find(id) != bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); + } } - - /** - * Check whether we have a posting list corresponding to postid - */ - bool hasBeenLoaded(const uint32_t postid) { - return ((shortlistmap.find(postid) != shortlistmap.end()) - || (bitmapmap.find(postid) != bitmapmap.end())); - } - - /** - * Compute the total of the volume of of posting lists - * corresponding to a query. - */ - size_t computeUnpackVolume(const vector &ids) { - size_t answer = 0; - for (uint32_t id : ids) { - answer += mapuncompsizes[id]; + size_t unpackVolume = 0; + if (shortlists.empty()) { + if (bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second, answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for (uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(), shortlists.end()); + sort(bitmaps.begin(), bitmaps.end()); + codec.decodeArray(shortlists[0].second->data(), + shortlists[0].second->size(), out, sizeout); + assert(shortlists[0].first == sizeout); + unpackVolume += sizeout; + assert(sizeout == shortlists[0].first); + for (uint32_t i = 1; (sizeout > 0) && (i < 
shortlists.size()); ++i) { + size_t thissize = recovbuffer.size(); + codec.decodeArray(shortlists[i].second->data(), + shortlists[i].second->size(), recovbuffer.data(), + thissize); + unpackVolume += thissize; + assert(shortlists[i].first == thissize); + sizeout = Inter(out, sizeout, recovbuffer.data(), thissize, out); + } + size_t pos = 0; + for (uint32_t i = 0; (sizeout > 0) && (i < bitmaps.size()); ++i) { + unpackVolume += bitmaps[i].first; + shared_ptr &ba = bitmaps[i].second; + pos = 0; + for (uint32_t i = 0; i < sizeout; ++i) { + if (!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; } - return answer; + sizeout = pos; + } + return unpackVolume; } + } - /** - * Compute the total of the intersection of of posting lists - * corresponding to a query, and output how many integers are - * in - * - * ids: the query as a set of posting ids - * out: write to result - * sizeout: will indicate how many integers were written to out. - * - * return unpack volume defined as the total volume of the posting - * lists that were needed to compute the intersection (which can - * be less than the total volume possible due to early abandoning). 
- */ - size_t intersect(const vector &ids, uint32_t *out, - size_t &sizeout) { - if (ids.empty()) { - sizeout = 0; - return 0; - } - vector>>> shortlists; - vector>> bitmaps; - //vector bitmapscard; - - for (uint32_t id : ids) { - if (shortlistmap.find(id) != shortlistmap.end()) - shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); - else { - assert(bitmapmap.find(id) != bitmapmap.end()); - bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); - } - } - size_t unpackVolume = 0; - if (shortlists.empty()) { - if (bitmaps.size() == 1) { - sizeout = bitmaps.front().second->toInts(out); - unpackVolume += sizeout; - return unpackVolume; - } - - BoolArray answer(mMaxId); - bitmaps[0].second->intersect(*bitmaps[1].second, answer); - unpackVolume += bitmaps[0].first + bitmaps[1].first; - for (uint32_t i = 2; i < bitmaps.size(); ++i) { - answer.inplaceIntersect(*bitmaps[i].second); - unpackVolume += bitmaps[i].first; - } - sizeout = answer.toInts(out); - return unpackVolume; - } else { - sort(shortlists.begin(), shortlists.end()); - sort(bitmaps.begin(), bitmaps.end()); - codec.decodeArray(shortlists[0].second->data(), shortlists[0].second->size(), out, sizeout); - assert(shortlists[0].first == sizeout); - unpackVolume += sizeout; - assert(sizeout == shortlists[0].first); - for (uint32_t i = 1; (sizeout > 0) && (i < shortlists.size()); ++i) { - size_t thissize = recovbuffer.size(); - codec.decodeArray(shortlists[i].second->data(), shortlists[i].second->size(), - recovbuffer.data(), thissize); - unpackVolume += thissize; - assert(shortlists[i].first == thissize); - sizeout = Inter(out, sizeout, recovbuffer.data(), thissize, out); - - } - size_t pos = 0; - for (uint32_t i = 0; (sizeout > 0) && (i < bitmaps.size()); ++i) { - unpackVolume += bitmaps[i].first; - shared_ptr &ba = bitmaps[i].second; - pos = 0; - for (uint32_t i = 0; i < sizeout; ++i) { - if (!ba->get(out[i])) - continue; - else - out[pos++] = out[i]; - } - sizeout = pos; - } - return 
unpackVolume; - } - } - - ~HybM2() {} - - /** - * Estimate of the volume of data used by this object. - */ - size_t storageInBytes() const { - size_t answer = 0; - for (auto i : bitmapmap) - answer += i.second->sizeInBytes(); - for (auto i : shortlistmap) - answer += i.second->size() * sizeof(uint32_t); - return answer; - } + ~HybM2() {} - size_t sizeOfRecoveryBufferInWords() const { - return recovbuffer.size(); - } -private: + /** + * Estimate of the volume of data used by this object. + */ + size_t storageInBytes() const { + size_t answer = 0; + for (auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for (auto i : shortlistmap) + answer += i.second->size() * sizeof(uint32_t); + return answer; + } - // load as either a bitmap or a compressed short list - size_t loadOptimized(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (mapuncompsizes.find(postid) != mapuncompsizes.end()) return 0; - vector *compressedbuffer = new vector(length + 1024); - size_t outlength = compressedbuffer->size(); - vector tmp(data, data + length); // use the buffer because some codecs modify the input - codec.encodeArray(tmp.data(), length, compressedbuffer->data(), outlength); - if (outlength * sizeof(uint32_t) < BoolArray::sizeInBytes(mMaxId)) { // we are good - if (recovbuffer.size() < length) recovbuffer.resize(length); - compressedbuffer->resize(outlength); - compressedbuffer->shrink_to_fit(); - shortlistmap[postid] = shared_ptr>(compressedbuffer); - mapuncompsizes[postid] = length; - return compressedbuffer->size(); - } else { - delete compressedbuffer; - return loadAsBitmap(postid, data, length); - } - } - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a bitmap. - * - * Do not call this directly, call load() instead. 
- */ - size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (bitmapmap.find(postid) != bitmapmap.end()) return 0; - BoolArray *ba = new BoolArray(mMaxId); - for (uint32_t k = 0; k < length; ++k) - ba->set(data[k]); - bitmapmap[postid] = shared_ptr(ba); - mapuncompsizes[postid] = length; - return ba->sizeInBytes() / sizeof(uint32_t); - } + size_t sizeOfRecoveryBufferInWords() const { return recovbuffer.size(); } - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a short array. - * - * Do not call this directly, call load() instead. - */ - size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (shortlistmap.find(postid) != shortlistmap.end()) return 0; - if (recovbuffer.size() < length) recovbuffer.resize(length); - vector *compressedbuffer = new vector(length + 1024); - size_t outlength = compressedbuffer->size(); - for (size_t i = 0; i < length; ++i) // use the buffer because some codecs modify the input - recovbuffer[i] = data[i]; - codec.encodeArray(recovbuffer.data(), length, compressedbuffer->data(), outlength); - compressedbuffer->resize(outlength); - compressedbuffer->shrink_to_fit(); - shortlistmap[postid] = shared_ptr>(compressedbuffer); - mapuncompsizes[postid] = length; - return compressedbuffer->size(); +private: + // load as either a bitmap or a compressed short list + size_t loadOptimized(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (mapuncompsizes.find(postid) != mapuncompsizes.end()) + return 0; + vector *compressedbuffer = new vector(length + 1024); + size_t outlength = compressedbuffer->size(); + vector tmp( + data, + data + length); // use the buffer because some codecs modify the input + codec.encodeArray(tmp.data(), length, compressedbuffer->data(), outlength); + if (outlength * sizeof(uint32_t) < + BoolArray::sizeInBytes(mMaxId)) { // we are good + if (recovbuffer.size() < 
length) + recovbuffer.resize(length); + compressedbuffer->resize(outlength); + compressedbuffer->shrink_to_fit(); + shortlistmap[postid] = shared_ptr>(compressedbuffer); + mapuncompsizes[postid] = length; + return compressedbuffer->size(); + } else { + delete compressedbuffer; + return loadAsBitmap(postid, data, length); } - - map> bitmapmap; - map>> shortlistmap; - map mapuncompsizes; - - const size_t mMaxId; //max value that can be stored in a list - const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? - - vector &recovbuffer; - - IntegerCODEC &codec; // how we compress the short lists - intersectionfunction Inter; - + } + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a bitmap. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (bitmapmap.find(postid) != bitmapmap.end()) + return 0; + BoolArray *ba = new BoolArray(mMaxId); + for (uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes() / sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a short array. + * + * Do not call this directly, call load() instead. 
+ */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (shortlistmap.find(postid) != shortlistmap.end()) + return 0; + if (recovbuffer.size() < length) + recovbuffer.resize(length); + vector *compressedbuffer = new vector(length + 1024); + size_t outlength = compressedbuffer->size(); + for (size_t i = 0; i < length; + ++i) // use the buffer because some codecs modify the input + recovbuffer[i] = data[i]; + codec.encodeArray(recovbuffer.data(), length, compressedbuffer->data(), + outlength); + compressedbuffer->resize(outlength); + compressedbuffer->shrink_to_fit(); + shortlistmap[postid] = shared_ptr>(compressedbuffer); + mapuncompsizes[postid] = length; + return compressedbuffer->size(); + } + + map> bitmapmap; + map>> shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; // max value that can be stored in a list + const size_t threshold; //// 32 seems to be the recommended setting, no need + ///to change it? + + vector &recovbuffer; + + IntegerCODEC &codec; // how we compress the short lists + intersectionfunction Inter; }; - - /** * This is a version of HybM2 without compression (other than the bitmaps). */ class UncompressedHybM2 { public: - - UncompressedHybM2(intersectionfunction inter, uint32_t MaxId, - uint32_t th = 32) : - bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), - threshold(th), - Inter(inter) { - } - - /** - * Returns the *uncompressed* size of a given posting list (specified - * by ID). - */ - size_t getSizeInInts(uint32_t postId) { - return mapuncompsizes[postId]; - } - - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * into the data structure. The data will be either converted to a bitmap or compressed. 
- */ - size_t load(const uint32_t postid, const uint32_t *data, - const uint32_t length) { - if (length * threshold >= mMaxId) - return loadAsBitmap(postid, data, length); - else - return loadAsShortArray(postid, data, length); + UncompressedHybM2(intersectionfunction inter, uint32_t MaxId, + uint32_t th = 32) + : bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), Inter(inter) {} + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). + */ + size_t getSizeInInts(uint32_t postId) { return mapuncompsizes[postId]; } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * into the data structure. The data will be either converted to a bitmap or + * compressed. + */ + size_t load(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) || + (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. 
+ */ + size_t computeUnpackVolume(const vector &ids) { + size_t answer = 0; + for (uint32_t id : ids) { + answer += mapuncompsizes[id]; } - - /** - * Check whether we have a posting list corresponding to postid - */ - bool hasBeenLoaded(const uint32_t postid) { - return ((shortlistmap.find(postid) != shortlistmap.end()) - || (bitmapmap.find(postid) != bitmapmap.end())); + return answer; + } + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). + */ + size_t intersect(const vector &ids, uint32_t *out, + size_t &sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; } - - /** - * Compute the total of the volume of of posting lists - * corresponding to a query. - */ - size_t computeUnpackVolume(const vector &ids) { - size_t answer = 0; - for (uint32_t id : ids) { - answer += mapuncompsizes[id]; - } - return answer; + vector>>> shortlists; + vector>> bitmaps; + // vector bitmapscard; + + for (uint32_t id : ids) { + if (shortlistmap.find(id) != shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); + else { + assert(bitmapmap.find(id) != bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); + } } - - - - /** - * Compute the total of the intersection of of posting lists - * corresponding to a query, and output how many integers are - * in - * - * ids: the query as a set of posting ids - * out: write to result - * sizeout: will indicate how many integers were written to out. 
- * - * return unpack volume defined as the total volume of the posting - * lists that were needed to compute the intersection (which can - * be less than the total volume possible due to early abandoning). - */ - size_t intersect(const vector &ids, uint32_t *out, - size_t &sizeout) { - if (ids.empty()) { - sizeout = 0; - return 0; - } - vector>>> shortlists; - vector>> bitmaps; - //vector bitmapscard; - - for (uint32_t id : ids) { - if (shortlistmap.find(id) != shortlistmap.end()) - shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); - else { - assert(bitmapmap.find(id) != bitmapmap.end()); - bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); - } - } - size_t unpackVolume = 0; - if (shortlists.empty()) { - if (bitmaps.size() == 1) { - sizeout = bitmaps.front().second->toInts(out); - unpackVolume += sizeout; - return unpackVolume; - } - - BoolArray answer(mMaxId); - bitmaps[0].second->intersect(*bitmaps[1].second, answer); - unpackVolume += bitmaps[0].first + bitmaps[1].first; - for (uint32_t i = 2; i < bitmaps.size(); ++i) { - answer.inplaceIntersect(*bitmaps[i].second); - unpackVolume += bitmaps[i].first; - } - sizeout = answer.toInts(out); - return unpackVolume; - } else { - sort(shortlists.begin(), shortlists.end()); - sort(bitmaps.begin(), bitmaps.end()); - assert(sizeout >= shortlists[0].second->size()); - sizeout = shortlists[0].second->size(); - unpackVolume += shortlists[0].second->size(); - assert(sizeout == shortlists[0].first); - // we have to make a copy because by convention the output is not directly from the index - const vector &firstvector = *shortlists[0].second; - for (uint32_t i = 0; i < firstvector.size(); ++i) - out[i] = firstvector[i]; - for (uint32_t i = 1; (sizeout > 0) && (i < shortlists.size()); ++i) { - unpackVolume += shortlists[i].first; - sizeout = Inter(out, sizeout, shortlists[i].second->data(), shortlists[i].second->size(), out); - } - size_t pos = 0; - for (uint32_t i = 0; (sizeout > 0) && (i < 
bitmaps.size()); ++i) { - unpackVolume += bitmaps[i].first; - shared_ptr &ba = bitmaps[i].second; - pos = 0; - for (uint32_t i = 0; i < sizeout; ++i) { - if (!ba->get(out[i])) - continue; - else - out[pos++] = out[i]; - } - sizeout = pos; - } - return unpackVolume; + size_t unpackVolume = 0; + if (shortlists.empty()) { + if (bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second, answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for (uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(), shortlists.end()); + sort(bitmaps.begin(), bitmaps.end()); + assert(sizeout >= shortlists[0].second->size()); + sizeout = shortlists[0].second->size(); + unpackVolume += shortlists[0].second->size(); + assert(sizeout == shortlists[0].first); + // we have to make a copy because by convention the output is not directly + // from the index + const vector &firstvector = *shortlists[0].second; + for (uint32_t i = 0; i < firstvector.size(); ++i) + out[i] = firstvector[i]; + for (uint32_t i = 1; (sizeout > 0) && (i < shortlists.size()); ++i) { + unpackVolume += shortlists[i].first; + sizeout = Inter(out, sizeout, shortlists[i].second->data(), + shortlists[i].second->size(), out); + } + size_t pos = 0; + for (uint32_t i = 0; (sizeout > 0) && (i < bitmaps.size()); ++i) { + unpackVolume += bitmaps[i].first; + shared_ptr &ba = bitmaps[i].second; + pos = 0; + for (uint32_t i = 0; i < sizeout; ++i) { + if (!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; } + sizeout = pos; + } + return unpackVolume; } - - ~UncompressedHybM2() {} - - /** - * Estimate of the volume of data used by this object. 
- */ - size_t storageInBytes() const { - size_t answer = 0; - for (auto i : bitmapmap) - answer += i.second->sizeInBytes(); - for (auto i : shortlistmap) - answer += i.second->size() * sizeof(uint32_t); - return answer; - } - + } + + ~UncompressedHybM2() {} + + /** + * Estimate of the volume of data used by this object. + */ + size_t storageInBytes() const { + size_t answer = 0; + for (auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for (auto i : shortlistmap) + answer += i.second->size() * sizeof(uint32_t); + return answer; + } private: - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a bitmap. - * - * Do not call this directly, call load() instead. - */ - size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (bitmapmap.find(postid) != bitmapmap.end()) return 0; - BoolArray *ba = new BoolArray(mMaxId); - for (uint32_t k = 0; k < length; ++k) - ba->set(data[k]); - bitmapmap[postid] = shared_ptr(ba); - mapuncompsizes[postid] = length; - return ba->sizeInBytes() / sizeof(uint32_t); - } - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a short array. - * - * Do not call this directly, call load() instead. - */ - size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (shortlistmap.find(postid) != shortlistmap.end()) return 0; - mapuncompsizes[postid] = length; - vector *compressedbuffer = new vector(data, data + length); - shortlistmap[postid] = shared_ptr>(compressedbuffer); - return compressedbuffer->size(); - } - - map> bitmapmap; - map>> shortlistmap; - map mapuncompsizes; - - const size_t mMaxId; //max value that can be stored in a list - const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? 
- - - intersectionfunction Inter; - + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a bitmap. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (bitmapmap.find(postid) != bitmapmap.end()) + return 0; + BoolArray *ba = new BoolArray(mMaxId); + for (uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes() / sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a short array. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (shortlistmap.find(postid) != shortlistmap.end()) + return 0; + mapuncompsizes[postid] = length; + vector *compressedbuffer = + new vector(data, data + length); + shortlistmap[postid] = shared_ptr>(compressedbuffer); + return compressedbuffer->size(); + } + + map> bitmapmap; + map>> shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; // max value that can be stored in a list + const size_t threshold; //// 32 seems to be the recommended setting, no need + ///to change it? + + intersectionfunction Inter; }; - - - /** - * This is a version of HybM2 with a skipping data structure akin to what is described + * This is a version of HybM2 with a skipping data structure akin to what is + * described * by Culpepper and Moffat. We seem to get no gain from this approach. */ class SkippingHybM2 { public: - - SkippingHybM2(uint32_t MaxId, - uint32_t th = 32, uint32_t BS = 8) : - bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), - threshold(th), - BlockSizeLog(BS) { - } - - /** - * Returns the *uncompressed* size of a given posting list (specified - * by ID). 
- */ - size_t getSizeInInts(uint32_t postId) { - return mapuncompsizes[postId]; - } - - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * into the data structure. The data will be either converted to a bitmap or compressed. - */ - size_t load(const uint32_t postid, const uint32_t *data, - const uint32_t length) { - if (length * threshold >= mMaxId) - return loadAsBitmap(postid, data, length); - else - return loadAsShortArray(postid, data, length); + SkippingHybM2(uint32_t MaxId, uint32_t th = 32, uint32_t BS = 8) + : bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), BlockSizeLog(BS) {} + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). + */ + size_t getSizeInInts(uint32_t postId) { return mapuncompsizes[postId]; } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * into the data structure. The data will be either converted to a bitmap or + * compressed. + */ + size_t load(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) || + (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. 
+ */ + size_t computeUnpackVolume(const vector &ids) { + size_t answer = 0; + for (uint32_t id : ids) { + answer += mapuncompsizes[id]; } - - /** - * Check whether we have a posting list corresponding to postid - */ - bool hasBeenLoaded(const uint32_t postid) { - return ((shortlistmap.find(postid) != shortlistmap.end()) - || (bitmapmap.find(postid) != bitmapmap.end())); + return answer; + } + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). + */ + size_t intersect(const vector &ids, uint32_t *out, + size_t &sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; } - - /** - * Compute the total of the volume of of posting lists - * corresponding to a query. - */ - size_t computeUnpackVolume(const vector &ids) { - size_t answer = 0; - for (uint32_t id : ids) { - answer += mapuncompsizes[id]; - } - return answer; + vector>> shortlists; + vector>> bitmaps; + for (uint32_t id : ids) { + if (shortlistmap.find(id) != shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); + else { + assert(bitmapmap.find(id) != bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); + } } - - - - /** - * Compute the total of the intersection of of posting lists - * corresponding to a query, and output how many integers are - * in - * - * ids: the query as a set of posting ids - * out: write to result - * sizeout: will indicate how many integers were written to out. 
- * - * return unpack volume defined as the total volume of the posting - * lists that were needed to compute the intersection (which can - * be less than the total volume possible due to early abandoning). - */ - size_t intersect(const vector &ids, uint32_t *out, - size_t &sizeout) { - if (ids.empty()) { - sizeout = 0; - return 0; - } - vector>> shortlists; - vector>> bitmaps; - for (uint32_t id : ids) { - if (shortlistmap.find(id) != shortlistmap.end()) - shortlists.push_back(make_pair(mapuncompsizes[id], shortlistmap[id])); - else { - assert(bitmapmap.find(id) != bitmapmap.end()); - bitmaps.push_back(make_pair(mapuncompsizes[id], bitmapmap[id])); - } + size_t unpackVolume = 0; + if (shortlists.empty()) { + if (bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second, answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for (uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(), shortlists.end()); + sort(bitmaps.begin(), bitmaps.end()); + if (shortlists.size() == 1) { + sizeout = shortlists[0].second->decompress(out); + unpackVolume += shortlists[0].second->Length; + } else { + unpackVolume += shortlists[0].second->Length; + unpackVolume += shortlists[1].second->Length; + sizeout = shortlists[0].second->intersect(*shortlists[1].second, out); + for (uint32_t i = 2; (sizeout > 0) && (i < shortlists.size()); ++i) { + unpackVolume += shortlists[i].first; + sizeout = shortlists[i].second->intersect(out, sizeout, out); } - size_t unpackVolume = 0; - if (shortlists.empty()) { - if (bitmaps.size() == 1) { - sizeout = bitmaps.front().second->toInts(out); - unpackVolume += sizeout; - return unpackVolume; - } - - BoolArray answer(mMaxId); - 
bitmaps[0].second->intersect(*bitmaps[1].second, answer); - unpackVolume += bitmaps[0].first + bitmaps[1].first; - for (uint32_t i = 2; i < bitmaps.size(); ++i) { - answer.inplaceIntersect(*bitmaps[i].second); - unpackVolume += bitmaps[i].first; - } - sizeout = answer.toInts(out); - return unpackVolume; - } else { - sort(shortlists.begin(), shortlists.end()); - sort(bitmaps.begin(), bitmaps.end()); - if (shortlists.size() == 1) { - sizeout = shortlists[0].second->decompress(out); - unpackVolume += shortlists[0].second->Length; - } else { - unpackVolume += shortlists[0].second->Length; - unpackVolume += shortlists[1].second->Length; - sizeout = shortlists[0].second->intersect(*shortlists[1].second, out); - for (uint32_t i = 2; (sizeout > 0) && (i < shortlists.size()); ++i) { - unpackVolume += shortlists[i].first; - sizeout = shortlists[i].second->intersect(out, sizeout, out); - } - } - size_t pos = 0; - for (uint32_t i = 0; (sizeout > 0) && (i < bitmaps.size()); ++i) { - unpackVolume += bitmaps[i].first; - shared_ptr &ba = bitmaps[i].second; - pos = 0; - for (uint32_t i = 0; i < sizeout; ++i) { - if (!ba->get(out[i])) - continue; - else - out[pos++] = out[i]; - } - sizeout = pos; - } - return unpackVolume; + } + size_t pos = 0; + for (uint32_t i = 0; (sizeout > 0) && (i < bitmaps.size()); ++i) { + unpackVolume += bitmaps[i].first; + shared_ptr &ba = bitmaps[i].second; + pos = 0; + for (uint32_t i = 0; i < sizeout; ++i) { + if (!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; } + sizeout = pos; + } + return unpackVolume; } - - ~SkippingHybM2() {} - - /** - * Estimate of the volume of data used by this object. - */ - size_t storageInBytes() const { - size_t answer = 0; - for (auto i : bitmapmap) - answer += i.second->sizeInBytes(); - for (auto i : shortlistmap) - answer += i.second->storageInBytes(); - return answer; - } - + } + + ~SkippingHybM2() {} + + /** + * Estimate of the volume of data used by this object. 
+ */ + size_t storageInBytes() const { + size_t answer = 0; + for (auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for (auto i : shortlistmap) + answer += i.second->storageInBytes(); + return answer; + } private: - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a bitmap. - * - * Do not call this directly, call load() instead. - */ - size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (bitmapmap.find(postid) != bitmapmap.end()) return 0; - BoolArray *ba = new BoolArray(mMaxId); - for (uint32_t k = 0; k < length; ++k) - ba->set(data[k]); - bitmapmap[postid] = shared_ptr(ba); - mapuncompsizes[postid] = length; - return ba->sizeInBytes() / sizeof(uint32_t); - } - - /** - * Load an array (data) of length "length" as the posting list corresponding to id postid - * as a short array. - * - * Do not call this directly, call load() instead. - */ - size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, const uint32_t length) { - if (shortlistmap.find(postid) != shortlistmap.end()) return 0; - - Skipping *compressedbuffer = new Skipping(BlockSizeLog, data, length); - shortlistmap[postid] = shared_ptr(compressedbuffer); - return compressedbuffer->storageInBytes() / sizeof(uint32_t); - } - - map> bitmapmap; - map> shortlistmap; - map mapuncompsizes; - - const size_t mMaxId; //max value that can be stored in a list - const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? - uint32_t BlockSizeLog; - - - + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a bitmap. + * + * Do not call this directly, call load() instead. 
+ */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (bitmapmap.find(postid) != bitmapmap.end()) + return 0; + BoolArray *ba = new BoolArray(mMaxId); + for (uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes() / sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding + * to id postid + * as a short array. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t *data, + const uint32_t length) { + if (shortlistmap.find(postid) != shortlistmap.end()) + return 0; + + Skipping *compressedbuffer = new Skipping(BlockSizeLog, data, length); + shortlistmap[postid] = shared_ptr(compressedbuffer); + return compressedbuffer->storageInBytes() / sizeof(uint32_t); + } + + map> bitmapmap; + map> shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; // max value that can be stored in a list + const size_t threshold; //// 32 seems to be the recommended setting, no need + ///to change it? 
+ uint32_t BlockSizeLog; }; } // namespace SIMDCompressionLib diff --git a/include/integratedbitpacking.h b/include/integratedbitpacking.h index b38e900..bf18c3b 100644 --- a/include/integratedbitpacking.h +++ b/include/integratedbitpacking.h @@ -11,131 +11,205 @@ namespace SIMDCompressionLib { +void __integratedfastunpack0(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack1(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack2(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack3(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack4(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack5(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack6(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack7(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack8(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack9(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack10(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack11(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack12(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack13(const uint32_t initoffset, + const uint32_t *__restrict__ in, + 
uint32_t *__restrict__ out); +void __integratedfastunpack14(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack15(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack16(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack17(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack18(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack19(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack20(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack21(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack22(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack23(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack24(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack25(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack26(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack27(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack28(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack29(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t 
*__restrict__ out); +void __integratedfastunpack30(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack31(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastunpack32(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); -void __integratedfastunpack0(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack1(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack2(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack3(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack4(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack5(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack6(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack7(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack8(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack9(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack10(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack11(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack12(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void 
__integratedfastunpack13(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack14(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack15(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack16(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack17(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack18(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack19(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack20(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack21(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack22(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack23(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack24(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack25(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack26(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack27(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack28(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack29(const uint32_t 
initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack30(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack31(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastunpack32(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); - - -void __integratedfastpack0(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack1(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack2(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack3(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack4(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack5(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack6(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack7(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack8(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack9(const uint32_t initoffset, const uint32_t *__restrict__ in, uint32_t *__restrict__ out); -void __integratedfastpack10(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack11(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack12(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void 
__integratedfastpack13(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack14(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack15(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack16(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack17(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack18(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack19(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack20(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack21(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack22(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack23(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack24(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack25(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack26(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack27(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack28(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack29(const uint32_t initoffset, const uint32_t *__restrict__ 
in, - uint32_t *__restrict__ out); -void __integratedfastpack30(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack31(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); -void __integratedfastpack32(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out); +void __integratedfastpack0(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack1(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack2(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack3(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack4(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack5(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack6(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack7(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack8(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack9(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack10(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack11(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack12(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void 
__integratedfastpack13(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack14(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack15(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack16(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack17(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack18(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack19(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack20(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack21(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack22(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack23(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack24(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack25(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack26(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack27(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack28(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack29(const uint32_t 
initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack30(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack31(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __integratedfastpack32(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); } // namespace SIMDCompressionLib diff --git a/include/intersection.h b/include/intersection.h index ad60ab9..4616a96 100644 --- a/include/intersection.h +++ b/include/intersection.h @@ -13,8 +13,9 @@ using namespace std; * cardinality of the intersection. */ typedef size_t (*intersectionfunction)(const uint32_t *set1, - const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out); - + const size_t length1, + const uint32_t *set2, + const size_t length2, uint32_t *out); /* * Given two arrays, this writes the intersection to out. Returns the @@ -23,9 +24,9 @@ typedef size_t (*intersectionfunction)(const uint32_t *set1, * This is a mix of very fast vectorized intersection algorithms, several * designed by N. Kurz, with adaptations by D. Lemire. */ -size_t SIMDintersection(const uint32_t *set1, - const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out); - +size_t SIMDintersection(const uint32_t *set1, const size_t length1, + const uint32_t *set2, const size_t length2, + uint32_t *out); /* * Given two arrays, this writes the intersection to out. Returns the @@ -45,54 +46,52 @@ size_t nate_scalar(const uint32_t *set1, const size_t length1, * by D. Lemire. 
*/ size_t onesidedgallopingintersection(const uint32_t *smallset, - const size_t smalllength, const uint32_t *largeset, - const size_t largelength, uint32_t *out) ; - - - - - + const size_t smalllength, + const uint32_t *largeset, + const size_t largelength, uint32_t *out); class IntersectionFactory { public: - static std::map intersection_schemes; + static std::map intersection_schemes; - static vector allNames() { - vector ans; - for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); ++i) { - ans.push_back(i->first); - } - return ans; + static vector allNames() { + vector ans; + for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); + ++i) { + ans.push_back(i->first); } - - static string getName(intersectionfunction v) { - for (auto i = intersection_schemes.begin(); i != intersection_schemes.end() ; ++i) { - if (i->second == v) - return i->first; - } - return "UNKNOWN"; - } - - static bool valid(string name) { - return (intersection_schemes.find(name) != intersection_schemes.end()) ; + return ans; + } + + static string getName(intersectionfunction v) { + for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); + ++i) { + if (i->second == v) + return i->first; } - - static intersectionfunction getFromName(string name) { - if (intersection_schemes.find(name) == intersection_schemes.end()) { - cerr << "name " << name << " does not refer to an intersection procedure." 
<< endl; - cerr << "possible choices:" << endl; - for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); ++i) { - cerr << static_cast(i->first) << endl; // useless cast, but just to be clear - } - return NULL; - } - return intersection_schemes[name]; + return "UNKNOWN"; + } + + static bool valid(string name) { + return (intersection_schemes.find(name) != intersection_schemes.end()); + } + + static intersectionfunction getFromName(string name) { + if (intersection_schemes.find(name) == intersection_schemes.end()) { + cerr << "name " << name << " does not refer to an intersection procedure." + << endl; + cerr << "possible choices:" << endl; + for (auto i = intersection_schemes.begin(); + i != intersection_schemes.end(); ++i) { + cerr << static_cast(i->first) + << endl; // useless cast, but just to be clear + } + return NULL; } - + return intersection_schemes[name]; + } }; - - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_INTERSECTION_H_ */ diff --git a/include/mersenne.h b/include/mersenne.h index ae61204..f00dcb1 100644 --- a/include/mersenne.h +++ b/include/mersenne.h @@ -19,82 +19,73 @@ namespace SIMDCompressionLib { class ZRandom { public: - enum { - N = 624, M = 397 - }; - unsigned int MT[N + 1]; - unsigned int *map[N]; - int nValues; - - ZRandom(unsigned int iSeed = 20070102); - void seed(unsigned iSeed); - unsigned int getValue(); - unsigned int getValue(const uint32_t MaxValue); - double getDouble(); - bool test(const double p); + enum { N = 624, M = 397 }; + unsigned int MT[N + 1]; + unsigned int *map[N]; + int nValues; + ZRandom(unsigned int iSeed = 20070102); + void seed(unsigned iSeed); + unsigned int getValue(); + unsigned int getValue(const uint32_t MaxValue); + double getDouble(); + bool test(const double p); }; -ZRandom::ZRandom(unsigned iSeed) : - nValues(0) { - seed(iSeed); -} +ZRandom::ZRandom(unsigned iSeed) : nValues(0) { seed(iSeed); } void ZRandom::seed(unsigned iSeed) { - nValues = 0; - // 
Seed the array used in random number generation. - MT[0] = iSeed; - for (int i = 1; i < N; ++i) { - MT[i] = 1 + (69069 * MT[i - 1]); - } - // Compute map once to avoid % in inner loop. - for (int i = 0; i < N; ++i) { - map[i] = MT + ((i + M) % N); - } + nValues = 0; + // Seed the array used in random number generation. + MT[0] = iSeed; + for (int i = 1; i < N; ++i) { + MT[i] = 1 + (69069 * MT[i - 1]); + } + // Compute map once to avoid % in inner loop. + for (int i = 0; i < N; ++i) { + map[i] = MT + ((i + M) % N); + } } -inline bool ZRandom::test(const double p) { - return getDouble() <= p; -} +inline bool ZRandom::test(const double p) { return getDouble() <= p; } inline double ZRandom::getDouble() { - return double(getValue()) * (1.0 / 4294967296.0); + return double(getValue()) * (1.0 / 4294967296.0); } unsigned int ZRandom::getValue(const uint32_t MaxValue) { - unsigned int used = MaxValue; - used |= used >> 1; - used |= used >> 2; - used |= used >> 4; - used |= used >> 8; - used |= used >> 16; + unsigned int used = MaxValue; + used |= used >> 1; + used |= used >> 2; + used |= used >> 4; + used |= used >> 8; + used |= used >> 16; - // Draw numbers until one is found in [0,n] - unsigned int i; - do - i = getValue() & used; // toss unused bits to shorten search - while (i > MaxValue); - return i; + // Draw numbers until one is found in [0,n] + unsigned int i; + do + i = getValue() & used; // toss unused bits to shorten search + while (i > MaxValue); + return i; } unsigned int ZRandom::getValue() { - if (0 == nValues) { - MT[N] = MT[0]; - for (int i = 0; i < N; ++i) { - unsigned y = (0x80000000 & MT[i]) | (0x7FFFFFFF - & MT[i + 1]); - unsigned v = *(map[i]) ^ (y >> 1); - if (1 & y) - v ^= 2567483615; - MT[i] = v; - } - nValues = N; + if (0 == nValues) { + MT[N] = MT[0]; + for (int i = 0; i < N; ++i) { + unsigned y = (0x80000000 & MT[i]) | (0x7FFFFFFF & MT[i + 1]); + unsigned v = *(map[i]) ^ (y >> 1); + if (1 & y) + v ^= 2567483615; + MT[i] = v; } - unsigned y = MT[N 
- nValues--]; - y ^= y >> 11; - y ^= static_cast((y << 7) & 2636928640); - y ^= static_cast((y << 15) & 4022730752); - y ^= y >> 18; - return y; + nValues = N; + } + unsigned y = MT[N - nValues--]; + y ^= y >> 11; + y ^= static_cast((y << 7) & 2636928640); + y ^= static_cast((y << 15) & 4022730752); + y ^= y >> 18; + return y; } } // namespace SIMDCompressionLib diff --git a/include/platform.h b/include/platform.h index b20ebce..309aef0 100644 --- a/include/platform.h +++ b/include/platform.h @@ -22,25 +22,23 @@ #ifdef _MSC_VER #include -uint32_t __inline __builtin_clz(uint32_t value) -{ - unsigned long leading_zero = 0; - return _BitScanReverse(&leading_zero, value) == 0 ? 0 : (31 - leading_zero); +uint32_t __inline __builtin_clz(uint32_t value) { + unsigned long leading_zero = 0; + return _BitScanReverse(&leading_zero, value) == 0 ? 0 : (31 - leading_zero); } -uint32_t __inline __builtin_ctz(uint32_t value) -{ - unsigned long trailing_zero = 0; - return _BitScanForward(&trailing_zero, value) == 0 ? 32 : trailing_zero; +uint32_t __inline __builtin_ctz(uint32_t value) { + unsigned long trailing_zero = 0; + return _BitScanForward(&trailing_zero, value) == 0 ? 32 : trailing_zero; } -uint32_t __inline __builtin_ctzl(uint64_t value) -{ +uint32_t __inline __builtin_ctzl(uint64_t value) { #ifdef _M_X64 - unsigned long trailing_zero = 0; - return _BitScanForward64(&trailing_zero, value) == 0 ? 64 : trailing_zero; + unsigned long trailing_zero = 0; + return _BitScanForward64(&trailing_zero, value) == 0 ? 64 : trailing_zero; #else - return ((value & 0xFFFFFFFF) == 0) ? (__builtin_ctz(value >> 32) + 32) : __builtin_ctz(value & 0xFFFFFFFF); + return ((value & 0xFFFFFFFF) == 0) ? 
(__builtin_ctz(value >> 32) + 32) + : __builtin_ctz(value & 0xFFFFFFFF); #endif } #endif diff --git a/include/simdbinarypacking.h b/include/simdbinarypacking.h index 5b24779..879064b 100644 --- a/include/simdbinarypacking.h +++ b/include/simdbinarypacking.h @@ -14,125 +14,129 @@ namespace SIMDCompressionLib { - - extern "C" { -/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value +/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the +* first encoded uint32 value * which is >= |key|, and returns its position. It is assumed that the values * stored are in sorted order. - * The encoded key is stored in "*presult". If no value is larger or equal to the key, + * The encoded key is stored in "*presult". If no value is larger or equal to +* the key, * 128 is returned */ -int -simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit, - uint32_t key, uint32_t *presult); - +int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult); /** * Simply scan, updating initOffset as it proceeds. */ -void -simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit); +void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit); /* returns the value stored at the specified "slot". 
*/ -uint32_t simdselectd1(__m128i * initOffset, const __m128i *in, uint32_t bit, - int slot); - - +uint32_t simdselectd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + int slot); } -template -struct SIMDBlockPacker { - typedef SIMDDeltaProcessor DeltaProcessor; - static void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, __m128i &initoffset) { - if (ArrayDispatch) - ArrayDispatch::SIMDunpack(reinterpret_cast(in), out, bit); - else - simdunpack(reinterpret_cast(in), out, bit); - if (bit < 32) { - initoffset = DeltaProcessor::runPrefixSum(initoffset, out); - } else { - initoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>(out + SIMDBlockSize - 4)); - } - } - - static uint32_t maxbits(const uint32_t *in, __m128i &initoffset) { - const __m128i *pin = reinterpret_cast(in); - __m128i newvec = MM_LOAD_SI_128(pin); - __m128i accumulator = DeltaHelper::Delta(newvec , initoffset); - __m128i oldvec = newvec; - for (uint32_t k = 1; 4 * k < SIMDBlockSize; ++k) { - newvec = MM_LOAD_SI_128(pin + k); - accumulator = _mm_or_si128(accumulator, DeltaHelper::Delta(newvec , oldvec)); - oldvec = newvec; - } - initoffset = oldvec; - return maxbitas32int(accumulator); - } - - static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, __m128i &initoffset) { - __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>(in + SIMDBlockSize - 4)); - if (bit < 32) - DeltaProcessor::runDelta(initoffset, in); - if (ArrayDispatch) - ArrayDispatch::SIMDpackwithoutmask(in, reinterpret_cast<__m128i *>(out), bit); - else - simdpackwithoutmask(in, reinterpret_cast<__m128i *>(out), bit); - initoffset = nextoffset; +template struct SIMDBlockPacker { + typedef SIMDDeltaProcessor DeltaProcessor; + static void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, + __m128i &initoffset) { + if (ArrayDispatch) + ArrayDispatch::SIMDunpack(reinterpret_cast(in), out, + bit); + else + simdunpack(reinterpret_cast(in), out, bit); + if (bit < 32) 
{ + initoffset = DeltaProcessor::runPrefixSum(initoffset, out); + } else { + initoffset = + MM_LOAD_SI_128(reinterpret_cast<__m128i *>(out + SIMDBlockSize - 4)); } - - static string name() { - if (ArrayDispatch) - return string("SIMDBlockPackerAD+") + DeltaHelper::name(); - else - return string("SIMDBlockPacker+") + DeltaHelper::name(); + } + + static uint32_t maxbits(const uint32_t *in, __m128i &initoffset) { + const __m128i *pin = reinterpret_cast(in); + __m128i newvec = MM_LOAD_SI_128(pin); + __m128i accumulator = DeltaHelper::Delta(newvec, initoffset); + __m128i oldvec = newvec; + for (uint32_t k = 1; 4 * k < SIMDBlockSize; ++k) { + newvec = MM_LOAD_SI_128(pin + k); + accumulator = + _mm_or_si128(accumulator, DeltaHelper::Delta(newvec, oldvec)); + oldvec = newvec; } - + initoffset = oldvec; + return maxbitas32int(accumulator); + } + + static void packblockwithoutmask(uint32_t *in, uint32_t *out, + const uint32_t bit, __m128i &initoffset) { + __m128i nextoffset = + MM_LOAD_SI_128(reinterpret_cast<__m128i *>(in + SIMDBlockSize - 4)); + if (bit < 32) + DeltaProcessor::runDelta(initoffset, in); + if (ArrayDispatch) + ArrayDispatch::SIMDpackwithoutmask(in, reinterpret_cast<__m128i *>(out), + bit); + else + simdpackwithoutmask(in, reinterpret_cast<__m128i *>(out), bit); + initoffset = nextoffset; + } + + static string name() { + if (ArrayDispatch) + return string("SIMDBlockPackerAD+") + DeltaHelper::name(); + else + return string("SIMDBlockPacker+") + DeltaHelper::name(); + } }; - template struct SIMDIntegratedBlockPacker { - static void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, __m128i &initoffset) { - if (ArrayDispatch) - initoffset = IntegratedArrayDispatch::SIMDiunpack(initoffset, reinterpret_cast(in), out, - bit); - else - initoffset = SIMDiunpack(initoffset, reinterpret_cast(in), out, bit); - } - - - static uint32_t maxbits(const uint32_t *in, __m128i &initoffset) { - const __m128i *pin = reinterpret_cast(in); - __m128i newvec = 
MM_LOAD_SI_128(pin); - __m128i accumulator = DeltaHelper::Delta(newvec , initoffset); - __m128i oldvec = newvec; - for (uint32_t k = 1; 4 * k < SIMDBlockSize; ++k) { - newvec = MM_LOAD_SI_128(pin + k); - accumulator = _mm_or_si128(accumulator, DeltaHelper::Delta(newvec , oldvec)); - oldvec = newvec; - } - initoffset = oldvec; - return maxbitas32int(accumulator); + static void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, + __m128i &initoffset) { + if (ArrayDispatch) + initoffset = IntegratedArrayDispatch::SIMDiunpack( + initoffset, reinterpret_cast(in), out, bit); + else + initoffset = SIMDiunpack( + initoffset, reinterpret_cast(in), out, bit); + } + + static uint32_t maxbits(const uint32_t *in, __m128i &initoffset) { + const __m128i *pin = reinterpret_cast(in); + __m128i newvec = MM_LOAD_SI_128(pin); + __m128i accumulator = DeltaHelper::Delta(newvec, initoffset); + __m128i oldvec = newvec; + for (uint32_t k = 1; 4 * k < SIMDBlockSize; ++k) { + newvec = MM_LOAD_SI_128(pin + k); + accumulator = + _mm_or_si128(accumulator, DeltaHelper::Delta(newvec, oldvec)); + oldvec = newvec; } - - static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, __m128i &initoffset) { - __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>(in + SIMDBlockSize - 4)); - if (ArrayDispatch) - IntegratedArrayDispatch::SIMDipackwithoutmask(initoffset, in, reinterpret_cast<__m128i *>(out), bit); - else - SIMDipackwithoutmask(initoffset, in, reinterpret_cast<__m128i *>(out), bit); - initoffset = nextoffset; - } - - static string name() { - if (ArrayDispatch) - return string("SIMDIntegratedBlockPackerAD+") + DeltaHelper::name(); - else - return string("SIMDIntegratedBlockPacker+") + DeltaHelper::name(); - } - + initoffset = oldvec; + return maxbitas32int(accumulator); + } + + static void packblockwithoutmask(uint32_t *in, uint32_t *out, + const uint32_t bit, __m128i &initoffset) { + __m128i nextoffset = + MM_LOAD_SI_128(reinterpret_cast<__m128i 
*>(in + SIMDBlockSize - 4)); + if (ArrayDispatch) + IntegratedArrayDispatch::SIMDipackwithoutmask( + initoffset, in, reinterpret_cast<__m128i *>(out), bit); + else + SIMDipackwithoutmask(initoffset, in, + reinterpret_cast<__m128i *>(out), bit); + initoffset = nextoffset; + } + + static string name() { + if (ArrayDispatch) + return string("SIMDIntegratedBlockPackerAD+") + DeltaHelper::name(); + else + return string("SIMDIntegratedBlockPacker+") + DeltaHelper::name(); + } }; /** @@ -144,248 +148,254 @@ struct SIMDIntegratedBlockPacker { * integers. * */ -template -class SIMDBinaryPacking: public IntegerCODEC { +template class SIMDBinaryPacking : public IntegerCODEC { public: #ifdef USE_ALIGNED - static const uint32_t CookiePadder = 123456;// just some made up number + static const uint32_t CookiePadder = 123456; // just some made up number #endif - static const uint32_t MiniBlockSize = 128; - static const uint32_t HowManyMiniBlocks = 16; - static const uint32_t BlockSize = MiniBlockSize;//HowManyMiniBlocks * MiniBlockSize; - - - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - checkifdivisibleby(length, BlockSize); - const uint32_t *const initout(out); + static const uint32_t MiniBlockSize = 128; + static const uint32_t HowManyMiniBlocks = 16; + static const uint32_t BlockSize = + MiniBlockSize; // HowManyMiniBlocks * MiniBlockSize; + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t *const initout(out); #ifdef USE_ALIGNED - if (needPaddingTo128Bits(out) - or needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - *out++ = static_cast(length); + *out++ = static_cast(length); #ifdef USE_ALIGNED - 
while (needPaddingTo128Bits(out)) *out++ = CookiePadder; + while (needPaddingTo128Bits(out)) + *out++ = CookiePadder; #endif - uint32_t Bs[HowManyMiniBlocks]; - __m128i init = _mm_set1_epi32(0); - const uint32_t *const final = in + length; - for (; in + HowManyMiniBlocks * MiniBlockSize - <= final; in += HowManyMiniBlocks * MiniBlockSize) { - __m128i tmpinit = init; - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); - } - *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) - | Bs[3]; - *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) - | Bs[7]; - *out++ = (Bs[8] << 24) | (Bs[9] << 16) | (Bs[10] << 8) - | Bs[11]; - *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) - | Bs[15]; - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init); - out += MiniBlockSize / 32 * Bs[i]; - } - } - if (in < final) { - const size_t howmany = (final - in) / MiniBlockSize; - __m128i tmpinit = init; - memset(&Bs[0], 0, HowManyMiniBlocks * sizeof(uint32_t)); - for (uint32_t i = 0; i < howmany; ++i) { - Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); - } - *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) - | Bs[3]; - *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) - | Bs[7]; - *out++ = (Bs[8] << 24) | (Bs[9] << 16) | (Bs[10] << 8) - | Bs[11]; - *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) - | Bs[15]; - for (uint32_t i = 0; i < howmany; ++i) { - BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init); - out += MiniBlockSize / 32 * Bs[i]; - } - in += howmany * MiniBlockSize; - assert(in == final); - } - nvalue = out - initout; + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + const uint32_t *const final = in + length; + for (; in + HowManyMiniBlocks * MiniBlockSize <= final; + in += HowManyMiniBlocks * MiniBlockSize) { + __m128i tmpinit = init; + for (uint32_t i 
= 0; i < HowManyMiniBlocks; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3]; + *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) | Bs[7]; + *out++ = (Bs[8] << 24) | (Bs[9] << 16) | (Bs[10] << 8) | Bs[11]; + *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) | Bs[15]; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], + init); + out += MiniBlockSize / 32 * Bs[i]; + } + } + if (in < final) { + const size_t howmany = (final - in) / MiniBlockSize; + __m128i tmpinit = init; + memset(&Bs[0], 0, HowManyMiniBlocks * sizeof(uint32_t)); + for (uint32_t i = 0; i < howmany; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3]; + *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) | Bs[7]; + *out++ = (Bs[8] << 24) | (Bs[9] << 16) | (Bs[10] << 8) | Bs[11]; + *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) | Bs[15]; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], + init); + out += MiniBlockSize / 32 * Bs[i]; + } + in += howmany * MiniBlockSize; + assert(in == final); } + nvalue = out - initout; + } - const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/, - uint32_t *out, size_t &nvalue) { + const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/, + uint32_t *out, size_t &nvalue) { #ifdef USE_ALIGNED - if (needPaddingTo128Bits(out) - or needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - const uint32_t actuallength = *in++; + const uint32_t actuallength = *in++; #ifdef 
USE_ALIGNED - while (needPaddingTo128Bits(in)) { - if (in[0] != CookiePadder) throw logic_error("SIMDBinaryPacking alignment issue."); - ++in; - } + while (needPaddingTo128Bits(in)) { + if (in[0] != CookiePadder) + throw logic_error("SIMDBinaryPacking alignment issue."); + ++in; + } #endif - const uint32_t *const initout(out); - uint32_t Bs[HowManyMiniBlocks]; - __m128i init = _mm_set1_epi32(0); - for (; out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize ; - out += HowManyMiniBlocks * MiniBlockSize) { - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); - in += MiniBlockSize / 32 * Bs[i]; - } - } - - if (out < initout + actuallength) { - const size_t howmany = (initout + actuallength - out) / MiniBlockSize; - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for (uint32_t i = 0; i < howmany; ++i) { - BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); - in += MiniBlockSize / 32 * Bs[i]; - } - out += howmany * MiniBlockSize; - assert(out == initout + actuallength); - } - nvalue = out - initout; - return in; + const uint32_t *const initout(out); + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + for (; out < initout + + actuallength / (HowManyMiniBlocks * MiniBlockSize) * + HowManyMiniBlocks * MiniBlockSize; + out += HowManyMiniBlocks * MiniBlockSize) { + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 
* i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); + in += MiniBlockSize / 32 * Bs[i]; + } } - // Returns a decompressed value in an encoded array - // could be greatly optimized in the non-differential coding case: currently just for delta coding - // WARNING: THIS IMPLEMENTATION WILL ONLY PROVIDE THE CORRECT RESULT - // WHEN USING REGULAR (D1) DIFFERENTIAL CODING. TODO: Generalize the support. TODO: Should check the type. - uint32_t select(uint32_t *in, size_t index) { + if (out < initout + actuallength) { + const size_t howmany = (initout + actuallength - out) / MiniBlockSize; + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init); + in += MiniBlockSize / 32 * Bs[i]; + } + out += howmany * MiniBlockSize; + assert(out == initout + actuallength); + } + nvalue = out - initout; + return in; + } + + // Returns a decompressed value in an encoded array + // could be greatly optimized in the non-differential coding case: currently + // just for delta coding + // WARNING: THIS IMPLEMENTATION WILL ONLY PROVIDE THE CORRECT RESULT + // WHEN USING REGULAR (D1) DIFFERENTIAL CODING. TODO: Generalize the + // support. TODO: Should check the type. 
+ uint32_t select(uint32_t *in, size_t index) { #ifdef USE_ALIGNED - if (needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - const uint32_t actuallength = *in++; + const uint32_t actuallength = *in++; #ifdef USE_ALIGNED - while (needPaddingTo128Bits(in)) { - if (in[0] != CookiePadder) throw logic_error("SIMDBinaryPacking alignment issue."); - ++in; - } + while (needPaddingTo128Bits(in)) { + if (in[0] != CookiePadder) + throw logic_error("SIMDBinaryPacking alignment issue."); + ++in; + } #endif - uint32_t Bs[HowManyMiniBlocks]; - __m128i init = _mm_set1_epi32(0); - size_t runningindex = 0; - for (; runningindex < actuallength / (HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize ;) { - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - if(runningindex + 128 > index) { - return simdselectd1(&init, (const __m128i *)in, Bs[i], - static_cast(index - runningindex)); - } - simdscand1(&init, (const __m128i *)in, Bs[i]); - runningindex += MiniBlockSize; - in += MiniBlockSize / 32 * Bs[i]; - } + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + size_t runningindex = 0; + for (; runningindex < actuallength / (HowManyMiniBlocks * MiniBlockSize) * + HowManyMiniBlocks * MiniBlockSize;) { + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + if (runningindex + 128 > index) { + return simdselectd1(&init, 
(const __m128i *)in, Bs[i], + static_cast(index - runningindex)); } + simdscand1(&init, (const __m128i *)in, Bs[i]); + runningindex += MiniBlockSize; + in += MiniBlockSize / 32 * Bs[i]; + } + } - if (runningindex < actuallength) { - const size_t howmany = (actuallength - runningindex ) / MiniBlockSize; - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for (uint32_t i = 0; i < howmany; ++i) { - if(runningindex + 128 > index) { - return simdselectd1(&init, (const __m128i *)in, Bs[i], - static_cast(index - runningindex)); - } - simdscand1(&init, (const __m128i *)in, Bs[i]); - runningindex += MiniBlockSize; - in += MiniBlockSize / 32 * Bs[i]; - } + if (runningindex < actuallength) { + const size_t howmany = (actuallength - runningindex) / MiniBlockSize; + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < howmany; ++i) { + if (runningindex + 128 > index) { + return simdselectd1(&init, (const __m128i *)in, Bs[i], + static_cast(index - runningindex)); } - return static_cast(runningindex); + simdscand1(&init, (const __m128i *)in, Bs[i]); + runningindex += MiniBlockSize; + in += MiniBlockSize / 32 * Bs[i]; + } } - - - // Performs a lower bound find in the encoded array. - // Returns the index - // WARNING: THIS IMPLEMENTATION WILL ONLY PROVIDE THE CORRECT RESULT - // WHEN USING REGULAR (D1) DIFFERENTIAL CODING. TODO: Generalize the support. TODO: Should check the type. - size_t findLowerBound(const uint32_t *in, const size_t /*length*/, uint32_t key, - uint32_t *presult) { + return static_cast(runningindex); + } + + // Performs a lower bound find in the encoded array. 
+ // Returns the index + // WARNING: THIS IMPLEMENTATION WILL ONLY PROVIDE THE CORRECT RESULT + // WHEN USING REGULAR (D1) DIFFERENTIAL CODING. TODO: Generalize the + // support. TODO: Should check the type. + size_t findLowerBound(const uint32_t *in, const size_t /*length*/, + uint32_t key, uint32_t *presult) { #ifdef USE_ALIGNED - if (needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - const uint32_t actuallength = *in++; + const uint32_t actuallength = *in++; #ifdef USE_ALIGNED - while (needPaddingTo128Bits(in)) { - if (in[0] != CookiePadder) throw logic_error("SIMDBinaryPacking alignment issue."); - ++in; - } + while (needPaddingTo128Bits(in)) { + if (in[0] != CookiePadder) + throw logic_error("SIMDBinaryPacking alignment issue."); + ++in; + } #endif - uint32_t Bs[HowManyMiniBlocks]; - __m128i init = _mm_set1_epi32(0); - size_t runningindex = 0; - for (; runningindex < actuallength / (HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize ;) { - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { - size_t index = simdsearchd1(&init, (const __m128i* ) in, Bs[i], - key, presult); - runningindex += index; - if(index < MiniBlockSize) return runningindex; - in += MiniBlockSize / 32 * Bs[i]; - } - } - - if (runningindex < actuallength) { - const size_t howmany = (actuallength - runningindex ) / MiniBlockSize; - for (uint32_t i = 0; i < 4 ; ++i, ++in) { - Bs[0 + 4 * i] = static_cast(in[0] >> 24); - Bs[1 + 4 * i] = static_cast(in[0] >> 16); - Bs[2 + 4 * i] = static_cast(in[0] >> 8); - Bs[3 + 4 * i] = static_cast(in[0]); - } - for 
(uint32_t i = 0; i < howmany; ++i) { - size_t index = simdsearchd1(&init, (const __m128i* ) in, Bs[i], - key, presult); - runningindex += index; - if(index < MiniBlockSize) return runningindex; - in += MiniBlockSize / 32 * Bs[i]; - } - } - return runningindex; + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + size_t runningindex = 0; + for (; runningindex < actuallength / (HowManyMiniBlocks * MiniBlockSize) * + HowManyMiniBlocks * MiniBlockSize;) { + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + size_t index = + simdsearchd1(&init, (const __m128i *)in, Bs[i], key, presult); + runningindex += index; + if (index < MiniBlockSize) + return runningindex; + in += MiniBlockSize / 32 * Bs[i]; + } } - string name() const { - ostringstream convert; - convert << "SIMDBinaryPacking" << "With" << BlockPacker::name() << MiniBlockSize; - return convert.str(); + if (runningindex < actuallength) { + const size_t howmany = (actuallength - runningindex) / MiniBlockSize; + for (uint32_t i = 0; i < 4; ++i, ++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < howmany; ++i) { + size_t index = + simdsearchd1(&init, (const __m128i *)in, Bs[i], key, presult); + runningindex += index; + if (index < MiniBlockSize) + return runningindex; + in += MiniBlockSize / 32 * Bs[i]; + } } - + return runningindex; + } + + string name() const { + ostringstream convert; + convert << "SIMDBinaryPacking" + << "With" << BlockPacker::name() << MiniBlockSize; + return convert.str(); + } }; } // namespace SIMDCompressionLib diff --git a/include/simdbitpacking.h b/include/simdbitpacking.h index d95bf31..f3a4e3d 100644 
--- a/include/simdbitpacking.h +++ b/include/simdbitpacking.h @@ -11,39 +11,38 @@ namespace SIMDCompressionLib { - -void __SIMD_fastunpack1(const __m128i *, uint32_t *); -void __SIMD_fastunpack2(const __m128i *, uint32_t *); -void __SIMD_fastunpack3(const __m128i *, uint32_t *); -void __SIMD_fastunpack4(const __m128i *, uint32_t *); -void __SIMD_fastunpack5(const __m128i *, uint32_t *); -void __SIMD_fastunpack6(const __m128i *, uint32_t *); -void __SIMD_fastunpack7(const __m128i *, uint32_t *); -void __SIMD_fastunpack8(const __m128i *, uint32_t *); -void __SIMD_fastunpack9(const __m128i *, uint32_t *); -void __SIMD_fastunpack10(const __m128i *, uint32_t *); -void __SIMD_fastunpack11(const __m128i *, uint32_t *); -void __SIMD_fastunpack12(const __m128i *, uint32_t *); -void __SIMD_fastunpack13(const __m128i *, uint32_t *); -void __SIMD_fastunpack14(const __m128i *, uint32_t *); -void __SIMD_fastunpack15(const __m128i *, uint32_t *); -void __SIMD_fastunpack16(const __m128i *, uint32_t *); -void __SIMD_fastunpack17(const __m128i *, uint32_t *); -void __SIMD_fastunpack18(const __m128i *, uint32_t *); -void __SIMD_fastunpack19(const __m128i *, uint32_t *); -void __SIMD_fastunpack20(const __m128i *, uint32_t *); -void __SIMD_fastunpack21(const __m128i *, uint32_t *); -void __SIMD_fastunpack22(const __m128i *, uint32_t *); -void __SIMD_fastunpack23(const __m128i *, uint32_t *); -void __SIMD_fastunpack24(const __m128i *, uint32_t *); -void __SIMD_fastunpack25(const __m128i *, uint32_t *); -void __SIMD_fastunpack26(const __m128i *, uint32_t *); -void __SIMD_fastunpack27(const __m128i *, uint32_t *); -void __SIMD_fastunpack28(const __m128i *, uint32_t *); -void __SIMD_fastunpack29(const __m128i *, uint32_t *); -void __SIMD_fastunpack30(const __m128i *, uint32_t *); -void __SIMD_fastunpack31(const __m128i *, uint32_t *); -void __SIMD_fastunpack32(const __m128i *, uint32_t *); +void __SIMD_fastunpack1(const __m128i *, uint32_t *); +void __SIMD_fastunpack2(const __m128i *, 
uint32_t *); +void __SIMD_fastunpack3(const __m128i *, uint32_t *); +void __SIMD_fastunpack4(const __m128i *, uint32_t *); +void __SIMD_fastunpack5(const __m128i *, uint32_t *); +void __SIMD_fastunpack6(const __m128i *, uint32_t *); +void __SIMD_fastunpack7(const __m128i *, uint32_t *); +void __SIMD_fastunpack8(const __m128i *, uint32_t *); +void __SIMD_fastunpack9(const __m128i *, uint32_t *); +void __SIMD_fastunpack10(const __m128i *, uint32_t *); +void __SIMD_fastunpack11(const __m128i *, uint32_t *); +void __SIMD_fastunpack12(const __m128i *, uint32_t *); +void __SIMD_fastunpack13(const __m128i *, uint32_t *); +void __SIMD_fastunpack14(const __m128i *, uint32_t *); +void __SIMD_fastunpack15(const __m128i *, uint32_t *); +void __SIMD_fastunpack16(const __m128i *, uint32_t *); +void __SIMD_fastunpack17(const __m128i *, uint32_t *); +void __SIMD_fastunpack18(const __m128i *, uint32_t *); +void __SIMD_fastunpack19(const __m128i *, uint32_t *); +void __SIMD_fastunpack20(const __m128i *, uint32_t *); +void __SIMD_fastunpack21(const __m128i *, uint32_t *); +void __SIMD_fastunpack22(const __m128i *, uint32_t *); +void __SIMD_fastunpack23(const __m128i *, uint32_t *); +void __SIMD_fastunpack24(const __m128i *, uint32_t *); +void __SIMD_fastunpack25(const __m128i *, uint32_t *); +void __SIMD_fastunpack26(const __m128i *, uint32_t *); +void __SIMD_fastunpack27(const __m128i *, uint32_t *); +void __SIMD_fastunpack28(const __m128i *, uint32_t *); +void __SIMD_fastunpack29(const __m128i *, uint32_t *); +void __SIMD_fastunpack30(const __m128i *, uint32_t *); +void __SIMD_fastunpack31(const __m128i *, uint32_t *); +void __SIMD_fastunpack32(const __m128i *, uint32_t *); void __SIMD_fastpackwithoutmask0(const uint32_t *, __m128i *); void __SIMD_fastpackwithoutmask1(const uint32_t *, __m128i *); @@ -113,9 +112,6 @@ void __SIMD_fastpack30(const uint32_t *, __m128i *); void __SIMD_fastpack31(const uint32_t *, __m128i *); void __SIMD_fastpack32(const uint32_t *, __m128i *); - - - } 
// namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_SIMDBITPACKING_H_ */ diff --git a/include/simdbitpackinghelpers.h b/include/simdbitpackinghelpers.h index 2cd8727..27c66fe 100644 --- a/include/simdbitpackinghelpers.h +++ b/include/simdbitpackinghelpers.h @@ -19,635 +19,1136 @@ namespace SIMDCompressionLib { const size_t SIMDBlockSize = 128; - -void SIMD_nullunpacker32(const __m128i *__restrict__ , uint32_t *__restrict__ out) { - memset(out, 0, 32 * 4 * 4); +void SIMD_nullunpacker32(const __m128i *__restrict__, + uint32_t *__restrict__ out) { + memset(out, 0, 32 * 4 * 4); } -void uSIMD_nullunpacker32(const __m128i *__restrict__ , uint32_t *__restrict__ out) { - memset(out, 0, 32 * 4 * 4); +void uSIMD_nullunpacker32(const __m128i *__restrict__, + uint32_t *__restrict__ out) { + memset(out, 0, 32 * 4 * 4); } +void simdunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + SIMD_nullunpacker32(in, out); + return; -void simdunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: SIMD_nullunpacker32(in, out); return; - - case 1: __SIMD_fastunpack1(in, out); return; - - case 2: __SIMD_fastunpack2(in, out); return; - - case 3: __SIMD_fastunpack3(in, out); return; - - case 4: __SIMD_fastunpack4(in, out); return; - - case 5: __SIMD_fastunpack5(in, out); return; - - case 6: __SIMD_fastunpack6(in, out); return; - - case 7: __SIMD_fastunpack7(in, out); return; - - case 8: __SIMD_fastunpack8(in, out); return; - - case 9: __SIMD_fastunpack9(in, out); return; - - case 10: __SIMD_fastunpack10(in, out); return; - - case 11: __SIMD_fastunpack11(in, out); return; - - case 12: __SIMD_fastunpack12(in, out); return; - - case 13: __SIMD_fastunpack13(in, out); return; - - case 14: __SIMD_fastunpack14(in, out); return; - - case 15: __SIMD_fastunpack15(in, out); return; + case 1: + __SIMD_fastunpack1(in, out); + return; - case 16: 
__SIMD_fastunpack16(in, out); return; + case 2: + __SIMD_fastunpack2(in, out); + return; - case 17: __SIMD_fastunpack17(in, out); return; + case 3: + __SIMD_fastunpack3(in, out); + return; - case 18: __SIMD_fastunpack18(in, out); return; + case 4: + __SIMD_fastunpack4(in, out); + return; - case 19: __SIMD_fastunpack19(in, out); return; + case 5: + __SIMD_fastunpack5(in, out); + return; - case 20: __SIMD_fastunpack20(in, out); return; + case 6: + __SIMD_fastunpack6(in, out); + return; - case 21: __SIMD_fastunpack21(in, out); return; + case 7: + __SIMD_fastunpack7(in, out); + return; - case 22: __SIMD_fastunpack22(in, out); return; + case 8: + __SIMD_fastunpack8(in, out); + return; + + case 9: + __SIMD_fastunpack9(in, out); + return; + + case 10: + __SIMD_fastunpack10(in, out); + return; + + case 11: + __SIMD_fastunpack11(in, out); + return; + + case 12: + __SIMD_fastunpack12(in, out); + return; + + case 13: + __SIMD_fastunpack13(in, out); + return; + + case 14: + __SIMD_fastunpack14(in, out); + return; + + case 15: + __SIMD_fastunpack15(in, out); + return; + + case 16: + __SIMD_fastunpack16(in, out); + return; + + case 17: + __SIMD_fastunpack17(in, out); + return; + + case 18: + __SIMD_fastunpack18(in, out); + return; + + case 19: + __SIMD_fastunpack19(in, out); + return; + + case 20: + __SIMD_fastunpack20(in, out); + return; + + case 21: + __SIMD_fastunpack21(in, out); + return; + + case 22: + __SIMD_fastunpack22(in, out); + return; + + case 23: + __SIMD_fastunpack23(in, out); + return; + + case 24: + __SIMD_fastunpack24(in, out); + return; + + case 25: + __SIMD_fastunpack25(in, out); + return; - case 23: __SIMD_fastunpack23(in, out); return; + case 26: + __SIMD_fastunpack26(in, out); + return; - case 24: __SIMD_fastunpack24(in, out); return; + case 27: + __SIMD_fastunpack27(in, out); + return; - case 25: __SIMD_fastunpack25(in, out); return; + case 28: + __SIMD_fastunpack28(in, out); + return; - case 26: __SIMD_fastunpack26(in, out); return; + case 29: + 
__SIMD_fastunpack29(in, out); + return; - case 27: __SIMD_fastunpack27(in, out); return; + case 30: + __SIMD_fastunpack30(in, out); + return; - case 28: __SIMD_fastunpack28(in, out); return; + case 31: + __SIMD_fastunpack31(in, out); + return; - case 29: __SIMD_fastunpack29(in, out); return; + case 32: + __SIMD_fastunpack32(in, out); + return; - case 30: __SIMD_fastunpack30(in, out); return; - - case 31: __SIMD_fastunpack31(in, out); return; - - case 32: __SIMD_fastunpack32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - - /*assumes that integers fit in the prescribed number of bits*/ -void simdpackwithoutmask(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: return; - - case 1: __SIMD_fastpackwithoutmask1(in, out); return; - - case 2: __SIMD_fastpackwithoutmask2(in, out); return; - - case 3: __SIMD_fastpackwithoutmask3(in, out); return; - - case 4: __SIMD_fastpackwithoutmask4(in, out); return; - - case 5: __SIMD_fastpackwithoutmask5(in, out); return; - - case 6: __SIMD_fastpackwithoutmask6(in, out); return; - - case 7: __SIMD_fastpackwithoutmask7(in, out); return; - - case 8: __SIMD_fastpackwithoutmask8(in, out); return; - - case 9: __SIMD_fastpackwithoutmask9(in, out); return; +void simdpackwithoutmask(const uint32_t *__restrict__ in, + __m128i *__restrict__ out, const uint32_t bit) { + switch (bit) { + case 0: + return; - case 10: __SIMD_fastpackwithoutmask10(in, out); return; + case 1: + __SIMD_fastpackwithoutmask1(in, out); + return; - case 11: __SIMD_fastpackwithoutmask11(in, out); return; + case 2: + __SIMD_fastpackwithoutmask2(in, out); + return; - case 12: __SIMD_fastpackwithoutmask12(in, out); return; + case 3: + __SIMD_fastpackwithoutmask3(in, out); + return; - case 13: __SIMD_fastpackwithoutmask13(in, out); return; + case 4: + __SIMD_fastpackwithoutmask4(in, 
out); + return; - case 14: __SIMD_fastpackwithoutmask14(in, out); return; + case 5: + __SIMD_fastpackwithoutmask5(in, out); + return; - case 15: __SIMD_fastpackwithoutmask15(in, out); return; + case 6: + __SIMD_fastpackwithoutmask6(in, out); + return; - case 16: __SIMD_fastpackwithoutmask16(in, out); return; + case 7: + __SIMD_fastpackwithoutmask7(in, out); + return; - case 17: __SIMD_fastpackwithoutmask17(in, out); return; + case 8: + __SIMD_fastpackwithoutmask8(in, out); + return; + + case 9: + __SIMD_fastpackwithoutmask9(in, out); + return; + + case 10: + __SIMD_fastpackwithoutmask10(in, out); + return; + + case 11: + __SIMD_fastpackwithoutmask11(in, out); + return; + + case 12: + __SIMD_fastpackwithoutmask12(in, out); + return; + + case 13: + __SIMD_fastpackwithoutmask13(in, out); + return; + + case 14: + __SIMD_fastpackwithoutmask14(in, out); + return; + + case 15: + __SIMD_fastpackwithoutmask15(in, out); + return; + + case 16: + __SIMD_fastpackwithoutmask16(in, out); + return; + + case 17: + __SIMD_fastpackwithoutmask17(in, out); + return; + + case 18: + __SIMD_fastpackwithoutmask18(in, out); + return; + + case 19: + __SIMD_fastpackwithoutmask19(in, out); + return; + + case 20: + __SIMD_fastpackwithoutmask20(in, out); + return; + + case 21: + __SIMD_fastpackwithoutmask21(in, out); + return; + + case 22: + __SIMD_fastpackwithoutmask22(in, out); + return; + + case 23: + __SIMD_fastpackwithoutmask23(in, out); + return; + + case 24: + __SIMD_fastpackwithoutmask24(in, out); + return; + + case 25: + __SIMD_fastpackwithoutmask25(in, out); + return; - case 18: __SIMD_fastpackwithoutmask18(in, out); return; + case 26: + __SIMD_fastpackwithoutmask26(in, out); + return; - case 19: __SIMD_fastpackwithoutmask19(in, out); return; + case 27: + __SIMD_fastpackwithoutmask27(in, out); + return; - case 20: __SIMD_fastpackwithoutmask20(in, out); return; + case 28: + __SIMD_fastpackwithoutmask28(in, out); + return; - case 21: __SIMD_fastpackwithoutmask21(in, out); return; + case 
29: + __SIMD_fastpackwithoutmask29(in, out); + return; - case 22: __SIMD_fastpackwithoutmask22(in, out); return; + case 30: + __SIMD_fastpackwithoutmask30(in, out); + return; - case 23: __SIMD_fastpackwithoutmask23(in, out); return; + case 31: + __SIMD_fastpackwithoutmask31(in, out); + return; - case 24: __SIMD_fastpackwithoutmask24(in, out); return; + case 32: + __SIMD_fastpackwithoutmask32(in, out); + return; - case 25: __SIMD_fastpackwithoutmask25(in, out); return; - - case 26: __SIMD_fastpackwithoutmask26(in, out); return; - - case 27: __SIMD_fastpackwithoutmask27(in, out); return; - - case 28: __SIMD_fastpackwithoutmask28(in, out); return; - - case 29: __SIMD_fastpackwithoutmask29(in, out); return; - - case 30: __SIMD_fastpackwithoutmask30(in, out); return; - - case 31: __SIMD_fastpackwithoutmask31(in, out); return; - - case 32: __SIMD_fastpackwithoutmask32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } /*assumes that integers fit in the prescribed number of bits*/ -void simdpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: return; - - case 1: __SIMD_fastpack1(in, out); return; - - case 2: __SIMD_fastpack2(in, out); return; - - case 3: __SIMD_fastpack3(in, out); return; - - case 4: __SIMD_fastpack4(in, out); return; - - case 5: __SIMD_fastpack5(in, out); return; - - case 6: __SIMD_fastpack6(in, out); return; - - case 7: __SIMD_fastpack7(in, out); return; +void simdpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + return; - case 8: __SIMD_fastpack8(in, out); return; + case 1: + __SIMD_fastpack1(in, out); + return; - case 9: __SIMD_fastpack9(in, out); return; + case 2: + __SIMD_fastpack2(in, out); + return; - case 10: __SIMD_fastpack10(in, out); return; + case 3: + __SIMD_fastpack3(in, out); + 
return; - case 11: __SIMD_fastpack11(in, out); return; + case 4: + __SIMD_fastpack4(in, out); + return; - case 12: __SIMD_fastpack12(in, out); return; + case 5: + __SIMD_fastpack5(in, out); + return; - case 13: __SIMD_fastpack13(in, out); return; + case 6: + __SIMD_fastpack6(in, out); + return; - case 14: __SIMD_fastpack14(in, out); return; + case 7: + __SIMD_fastpack7(in, out); + return; - case 15: __SIMD_fastpack15(in, out); return; + case 8: + __SIMD_fastpack8(in, out); + return; + + case 9: + __SIMD_fastpack9(in, out); + return; + + case 10: + __SIMD_fastpack10(in, out); + return; + + case 11: + __SIMD_fastpack11(in, out); + return; + + case 12: + __SIMD_fastpack12(in, out); + return; + + case 13: + __SIMD_fastpack13(in, out); + return; + + case 14: + __SIMD_fastpack14(in, out); + return; + + case 15: + __SIMD_fastpack15(in, out); + return; + + case 16: + __SIMD_fastpack16(in, out); + return; + + case 17: + __SIMD_fastpack17(in, out); + return; + + case 18: + __SIMD_fastpack18(in, out); + return; + + case 19: + __SIMD_fastpack19(in, out); + return; + + case 20: + __SIMD_fastpack20(in, out); + return; + + case 21: + __SIMD_fastpack21(in, out); + return; + + case 22: + __SIMD_fastpack22(in, out); + return; + + case 23: + __SIMD_fastpack23(in, out); + return; + + case 24: + __SIMD_fastpack24(in, out); + return; + + case 25: + __SIMD_fastpack25(in, out); + return; - case 16: __SIMD_fastpack16(in, out); return; + case 26: + __SIMD_fastpack26(in, out); + return; - case 17: __SIMD_fastpack17(in, out); return; + case 27: + __SIMD_fastpack27(in, out); + return; - case 18: __SIMD_fastpack18(in, out); return; + case 28: + __SIMD_fastpack28(in, out); + return; - case 19: __SIMD_fastpack19(in, out); return; + case 29: + __SIMD_fastpack29(in, out); + return; - case 20: __SIMD_fastpack20(in, out); return; + case 30: + __SIMD_fastpack30(in, out); + return; - case 21: __SIMD_fastpack21(in, out); return; + case 31: + __SIMD_fastpack31(in, out); + return; - case 22: 
__SIMD_fastpack22(in, out); return; + case 32: + __SIMD_fastpack32(in, out); + return; - case 23: __SIMD_fastpack23(in, out); return; - - case 24: __SIMD_fastpack24(in, out); return; - - case 25: __SIMD_fastpack25(in, out); return; - - case 26: __SIMD_fastpack26(in, out); return; - - case 27: __SIMD_fastpack27(in, out); return; - - case 28: __SIMD_fastpack28(in, out); return; - - case 29: __SIMD_fastpack29(in, out); return; - - case 30: __SIMD_fastpack30(in, out); return; - - case 31: __SIMD_fastpack31(in, out); return; - - case 32: __SIMD_fastpack32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } +void usimdunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + uSIMD_nullunpacker32(in, out); + return; -void usimdunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: uSIMD_nullunpacker32(in, out); return; - - case 1: __uSIMD_fastunpack1(in, out); return; - - case 2: __uSIMD_fastunpack2(in, out); return; - - case 3: __uSIMD_fastunpack3(in, out); return; - - case 4: __uSIMD_fastunpack4(in, out); return; + case 1: + __uSIMD_fastunpack1(in, out); + return; - case 5: __uSIMD_fastunpack5(in, out); return; + case 2: + __uSIMD_fastunpack2(in, out); + return; - case 6: __uSIMD_fastunpack6(in, out); return; + case 3: + __uSIMD_fastunpack3(in, out); + return; - case 7: __uSIMD_fastunpack7(in, out); return; + case 4: + __uSIMD_fastunpack4(in, out); + return; - case 8: __uSIMD_fastunpack8(in, out); return; + case 5: + __uSIMD_fastunpack5(in, out); + return; - case 9: __uSIMD_fastunpack9(in, out); return; + case 6: + __uSIMD_fastunpack6(in, out); + return; - case 10: __uSIMD_fastunpack10(in, out); return; + case 7: + __uSIMD_fastunpack7(in, out); + return; - case 11: __uSIMD_fastunpack11(in, out); return; + case 8: + 
__uSIMD_fastunpack8(in, out); + return; + + case 9: + __uSIMD_fastunpack9(in, out); + return; + + case 10: + __uSIMD_fastunpack10(in, out); + return; + + case 11: + __uSIMD_fastunpack11(in, out); + return; + + case 12: + __uSIMD_fastunpack12(in, out); + return; + + case 13: + __uSIMD_fastunpack13(in, out); + return; + + case 14: + __uSIMD_fastunpack14(in, out); + return; + + case 15: + __uSIMD_fastunpack15(in, out); + return; + + case 16: + __uSIMD_fastunpack16(in, out); + return; + + case 17: + __uSIMD_fastunpack17(in, out); + return; + + case 18: + __uSIMD_fastunpack18(in, out); + return; + + case 19: + __uSIMD_fastunpack19(in, out); + return; + + case 20: + __uSIMD_fastunpack20(in, out); + return; + + case 21: + __uSIMD_fastunpack21(in, out); + return; + + case 22: + __uSIMD_fastunpack22(in, out); + return; + + case 23: + __uSIMD_fastunpack23(in, out); + return; + + case 24: + __uSIMD_fastunpack24(in, out); + return; + + case 25: + __uSIMD_fastunpack25(in, out); + return; - case 12: __uSIMD_fastunpack12(in, out); return; + case 26: + __uSIMD_fastunpack26(in, out); + return; - case 13: __uSIMD_fastunpack13(in, out); return; + case 27: + __uSIMD_fastunpack27(in, out); + return; - case 14: __uSIMD_fastunpack14(in, out); return; + case 28: + __uSIMD_fastunpack28(in, out); + return; - case 15: __uSIMD_fastunpack15(in, out); return; + case 29: + __uSIMD_fastunpack29(in, out); + return; - case 16: __uSIMD_fastunpack16(in, out); return; + case 30: + __uSIMD_fastunpack30(in, out); + return; - case 17: __uSIMD_fastunpack17(in, out); return; + case 31: + __uSIMD_fastunpack31(in, out); + return; - case 18: __uSIMD_fastunpack18(in, out); return; + case 32: + __uSIMD_fastunpack32(in, out); + return; - case 19: __uSIMD_fastunpack19(in, out); return; - - case 20: __uSIMD_fastunpack20(in, out); return; - - case 21: __uSIMD_fastunpack21(in, out); return; - - case 22: __uSIMD_fastunpack22(in, out); return; - - case 23: __uSIMD_fastunpack23(in, out); return; - - case 24: 
__uSIMD_fastunpack24(in, out); return; - - case 25: __uSIMD_fastunpack25(in, out); return; - - case 26: __uSIMD_fastunpack26(in, out); return; - - case 27: __uSIMD_fastunpack27(in, out); return; - - case 28: __uSIMD_fastunpack28(in, out); return; - - case 29: __uSIMD_fastunpack29(in, out); return; - - case 30: __uSIMD_fastunpack30(in, out); return; - - case 31: __uSIMD_fastunpack31(in, out); return; - - case 32: __uSIMD_fastunpack32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - - /*assumes that integers fit in the prescribed number of bits*/ -void usimdpackwithoutmask(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: return; - - case 1: __uSIMD_fastpackwithoutmask1(in, out); return; - - case 2: __uSIMD_fastpackwithoutmask2(in, out); return; - - case 3: __uSIMD_fastpackwithoutmask3(in, out); return; - - case 4: __uSIMD_fastpackwithoutmask4(in, out); return; - - case 5: __uSIMD_fastpackwithoutmask5(in, out); return; - - case 6: __uSIMD_fastpackwithoutmask6(in, out); return; - - case 7: __uSIMD_fastpackwithoutmask7(in, out); return; - - case 8: __uSIMD_fastpackwithoutmask8(in, out); return; - - case 9: __uSIMD_fastpackwithoutmask9(in, out); return; - - case 10: __uSIMD_fastpackwithoutmask10(in, out); return; - - case 11: __uSIMD_fastpackwithoutmask11(in, out); return; - - case 12: __uSIMD_fastpackwithoutmask12(in, out); return; - - case 13: __uSIMD_fastpackwithoutmask13(in, out); return; - - case 14: __uSIMD_fastpackwithoutmask14(in, out); return; +void usimdpackwithoutmask(const uint32_t *__restrict__ in, + __m128i *__restrict__ out, const uint32_t bit) { + switch (bit) { + case 0: + return; - case 15: __uSIMD_fastpackwithoutmask15(in, out); return; + case 1: + __uSIMD_fastpackwithoutmask1(in, out); + return; - case 16: __uSIMD_fastpackwithoutmask16(in, out); return; + 
case 2: + __uSIMD_fastpackwithoutmask2(in, out); + return; - case 17: __uSIMD_fastpackwithoutmask17(in, out); return; + case 3: + __uSIMD_fastpackwithoutmask3(in, out); + return; - case 18: __uSIMD_fastpackwithoutmask18(in, out); return; + case 4: + __uSIMD_fastpackwithoutmask4(in, out); + return; - case 19: __uSIMD_fastpackwithoutmask19(in, out); return; + case 5: + __uSIMD_fastpackwithoutmask5(in, out); + return; - case 20: __uSIMD_fastpackwithoutmask20(in, out); return; + case 6: + __uSIMD_fastpackwithoutmask6(in, out); + return; - case 21: __uSIMD_fastpackwithoutmask21(in, out); return; + case 7: + __uSIMD_fastpackwithoutmask7(in, out); + return; - case 22: __uSIMD_fastpackwithoutmask22(in, out); return; + case 8: + __uSIMD_fastpackwithoutmask8(in, out); + return; + + case 9: + __uSIMD_fastpackwithoutmask9(in, out); + return; + + case 10: + __uSIMD_fastpackwithoutmask10(in, out); + return; + + case 11: + __uSIMD_fastpackwithoutmask11(in, out); + return; + + case 12: + __uSIMD_fastpackwithoutmask12(in, out); + return; + + case 13: + __uSIMD_fastpackwithoutmask13(in, out); + return; + + case 14: + __uSIMD_fastpackwithoutmask14(in, out); + return; + + case 15: + __uSIMD_fastpackwithoutmask15(in, out); + return; + + case 16: + __uSIMD_fastpackwithoutmask16(in, out); + return; + + case 17: + __uSIMD_fastpackwithoutmask17(in, out); + return; + + case 18: + __uSIMD_fastpackwithoutmask18(in, out); + return; + + case 19: + __uSIMD_fastpackwithoutmask19(in, out); + return; + + case 20: + __uSIMD_fastpackwithoutmask20(in, out); + return; + + case 21: + __uSIMD_fastpackwithoutmask21(in, out); + return; + + case 22: + __uSIMD_fastpackwithoutmask22(in, out); + return; + + case 23: + __uSIMD_fastpackwithoutmask23(in, out); + return; + + case 24: + __uSIMD_fastpackwithoutmask24(in, out); + return; + + case 25: + __uSIMD_fastpackwithoutmask25(in, out); + return; - case 23: __uSIMD_fastpackwithoutmask23(in, out); return; + case 26: + __uSIMD_fastpackwithoutmask26(in, out); + 
return; - case 24: __uSIMD_fastpackwithoutmask24(in, out); return; + case 27: + __uSIMD_fastpackwithoutmask27(in, out); + return; - case 25: __uSIMD_fastpackwithoutmask25(in, out); return; + case 28: + __uSIMD_fastpackwithoutmask28(in, out); + return; - case 26: __uSIMD_fastpackwithoutmask26(in, out); return; + case 29: + __uSIMD_fastpackwithoutmask29(in, out); + return; - case 27: __uSIMD_fastpackwithoutmask27(in, out); return; + case 30: + __uSIMD_fastpackwithoutmask30(in, out); + return; - case 28: __uSIMD_fastpackwithoutmask28(in, out); return; + case 31: + __uSIMD_fastpackwithoutmask31(in, out); + return; - case 29: __uSIMD_fastpackwithoutmask29(in, out); return; + case 32: + __uSIMD_fastpackwithoutmask32(in, out); + return; - case 30: __uSIMD_fastpackwithoutmask30(in, out); return; - - case 31: __uSIMD_fastpackwithoutmask31(in, out); return; - - case 32: __uSIMD_fastpackwithoutmask32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } -void usimdpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - switch (bit) { - case 0: return; +void usimdpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + return; - case 1: __uSIMD_fastpack1(in, out); return; + case 1: + __uSIMD_fastpack1(in, out); + return; - case 2: __uSIMD_fastpack2(in, out); return; + case 2: + __uSIMD_fastpack2(in, out); + return; - case 3: __uSIMD_fastpack3(in, out); return; + case 3: + __uSIMD_fastpack3(in, out); + return; - case 4: __uSIMD_fastpack4(in, out); return; + case 4: + __uSIMD_fastpack4(in, out); + return; - case 5: __uSIMD_fastpack5(in, out); return; + case 5: + __uSIMD_fastpack5(in, out); + return; - case 6: __uSIMD_fastpack6(in, out); return; + case 6: + __uSIMD_fastpack6(in, out); + return; - case 7: __uSIMD_fastpack7(in, out); return; + case 7: + 
__uSIMD_fastpack7(in, out); + return; - case 8: __uSIMD_fastpack8(in, out); return; + case 8: + __uSIMD_fastpack8(in, out); + return; - case 9: __uSIMD_fastpack9(in, out); return; + case 9: + __uSIMD_fastpack9(in, out); + return; - case 10: __uSIMD_fastpack10(in, out); return; + case 10: + __uSIMD_fastpack10(in, out); + return; - case 11: __uSIMD_fastpack11(in, out); return; + case 11: + __uSIMD_fastpack11(in, out); + return; - case 12: __uSIMD_fastpack12(in, out); return; + case 12: + __uSIMD_fastpack12(in, out); + return; - case 13: __uSIMD_fastpack13(in, out); return; + case 13: + __uSIMD_fastpack13(in, out); + return; - case 14: __uSIMD_fastpack14(in, out); return; + case 14: + __uSIMD_fastpack14(in, out); + return; - case 15: __uSIMD_fastpack15(in, out); return; + case 15: + __uSIMD_fastpack15(in, out); + return; - case 16: __uSIMD_fastpack16(in, out); return; + case 16: + __uSIMD_fastpack16(in, out); + return; - case 17: __uSIMD_fastpack17(in, out); return; + case 17: + __uSIMD_fastpack17(in, out); + return; - case 18: __uSIMD_fastpack18(in, out); return; + case 18: + __uSIMD_fastpack18(in, out); + return; - case 19: __uSIMD_fastpack19(in, out); return; + case 19: + __uSIMD_fastpack19(in, out); + return; - case 20: __uSIMD_fastpack20(in, out); return; + case 20: + __uSIMD_fastpack20(in, out); + return; - case 21: __uSIMD_fastpack21(in, out); return; + case 21: + __uSIMD_fastpack21(in, out); + return; - case 22: __uSIMD_fastpack22(in, out); return; + case 22: + __uSIMD_fastpack22(in, out); + return; - case 23: __uSIMD_fastpack23(in, out); return; + case 23: + __uSIMD_fastpack23(in, out); + return; - case 24: __uSIMD_fastpack24(in, out); return; + case 24: + __uSIMD_fastpack24(in, out); + return; + + case 25: + __uSIMD_fastpack25(in, out); + return; - case 25: __uSIMD_fastpack25(in, out); return; + case 26: + __uSIMD_fastpack26(in, out); + return; - case 26: __uSIMD_fastpack26(in, out); return; + case 27: + __uSIMD_fastpack27(in, out); + return; - case 27: 
__uSIMD_fastpack27(in, out); return; + case 28: + __uSIMD_fastpack28(in, out); + return; - case 28: __uSIMD_fastpack28(in, out); return; + case 29: + __uSIMD_fastpack29(in, out); + return; - case 29: __uSIMD_fastpack29(in, out); return; + case 30: + __uSIMD_fastpack30(in, out); + return; - case 30: __uSIMD_fastpack30(in, out); return; + case 31: + __uSIMD_fastpack31(in, out); + return; - case 31: __uSIMD_fastpack31(in, out); return; + case 32: + __uSIMD_fastpack32(in, out); + return; - case 32: __uSIMD_fastpack32(in, out); return; - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - - - - - namespace ArrayDispatch { typedef void (*unpackingfunction)(const __m128i *, uint32_t *); typedef void (*packingfunction)(const uint32_t *, __m128i *); - -constexpr unpackingfunction unpack[33] = {SIMD_nullunpacker32, __SIMD_fastunpack1, __SIMD_fastunpack2, __SIMD_fastunpack3, __SIMD_fastunpack4, __SIMD_fastunpack5, __SIMD_fastunpack6, __SIMD_fastunpack7, __SIMD_fastunpack8, __SIMD_fastunpack9, __SIMD_fastunpack10, __SIMD_fastunpack11, __SIMD_fastunpack12, __SIMD_fastunpack13, __SIMD_fastunpack14, __SIMD_fastunpack15, __SIMD_fastunpack16, __SIMD_fastunpack17, __SIMD_fastunpack18, __SIMD_fastunpack19, __SIMD_fastunpack20, __SIMD_fastunpack21, __SIMD_fastunpack22, __SIMD_fastunpack23, __SIMD_fastunpack24, __SIMD_fastunpack25, __SIMD_fastunpack26, __SIMD_fastunpack27, __SIMD_fastunpack28, __SIMD_fastunpack29, __SIMD_fastunpack30, __SIMD_fastunpack31, __SIMD_fastunpack32}; +constexpr unpackingfunction unpack[33] = { + SIMD_nullunpacker32, __SIMD_fastunpack1, __SIMD_fastunpack2, + __SIMD_fastunpack3, __SIMD_fastunpack4, __SIMD_fastunpack5, + __SIMD_fastunpack6, __SIMD_fastunpack7, __SIMD_fastunpack8, + __SIMD_fastunpack9, __SIMD_fastunpack10, __SIMD_fastunpack11, + __SIMD_fastunpack12, __SIMD_fastunpack13, __SIMD_fastunpack14, + __SIMD_fastunpack15, 
__SIMD_fastunpack16, __SIMD_fastunpack17, + __SIMD_fastunpack18, __SIMD_fastunpack19, __SIMD_fastunpack20, + __SIMD_fastunpack21, __SIMD_fastunpack22, __SIMD_fastunpack23, + __SIMD_fastunpack24, __SIMD_fastunpack25, __SIMD_fastunpack26, + __SIMD_fastunpack27, __SIMD_fastunpack28, __SIMD_fastunpack29, + __SIMD_fastunpack30, __SIMD_fastunpack31, __SIMD_fastunpack32}; ALWAYS_INLINE -void SIMDunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - return unpack[bit](in, out); +void SIMDunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + return unpack[bit](in, out); } -constexpr packingfunction packwithoutmask[33] = {__SIMD_fastpackwithoutmask0, __SIMD_fastpackwithoutmask1, __SIMD_fastpackwithoutmask2, __SIMD_fastpackwithoutmask3, __SIMD_fastpackwithoutmask4, __SIMD_fastpackwithoutmask5, __SIMD_fastpackwithoutmask6, __SIMD_fastpackwithoutmask7, __SIMD_fastpackwithoutmask8, __SIMD_fastpackwithoutmask9, __SIMD_fastpackwithoutmask10, __SIMD_fastpackwithoutmask11, __SIMD_fastpackwithoutmask12, __SIMD_fastpackwithoutmask13, __SIMD_fastpackwithoutmask14, __SIMD_fastpackwithoutmask15, __SIMD_fastpackwithoutmask16, __SIMD_fastpackwithoutmask17, __SIMD_fastpackwithoutmask18, __SIMD_fastpackwithoutmask19, __SIMD_fastpackwithoutmask20, __SIMD_fastpackwithoutmask21, __SIMD_fastpackwithoutmask22, __SIMD_fastpackwithoutmask23, __SIMD_fastpackwithoutmask24, __SIMD_fastpackwithoutmask25, __SIMD_fastpackwithoutmask26, __SIMD_fastpackwithoutmask27, __SIMD_fastpackwithoutmask28, __SIMD_fastpackwithoutmask29, __SIMD_fastpackwithoutmask30, __SIMD_fastpackwithoutmask31, __SIMD_fastpackwithoutmask32}; +constexpr packingfunction packwithoutmask[33] = { + __SIMD_fastpackwithoutmask0, __SIMD_fastpackwithoutmask1, + __SIMD_fastpackwithoutmask2, __SIMD_fastpackwithoutmask3, + __SIMD_fastpackwithoutmask4, __SIMD_fastpackwithoutmask5, + __SIMD_fastpackwithoutmask6, __SIMD_fastpackwithoutmask7, + __SIMD_fastpackwithoutmask8, 
__SIMD_fastpackwithoutmask9, + __SIMD_fastpackwithoutmask10, __SIMD_fastpackwithoutmask11, + __SIMD_fastpackwithoutmask12, __SIMD_fastpackwithoutmask13, + __SIMD_fastpackwithoutmask14, __SIMD_fastpackwithoutmask15, + __SIMD_fastpackwithoutmask16, __SIMD_fastpackwithoutmask17, + __SIMD_fastpackwithoutmask18, __SIMD_fastpackwithoutmask19, + __SIMD_fastpackwithoutmask20, __SIMD_fastpackwithoutmask21, + __SIMD_fastpackwithoutmask22, __SIMD_fastpackwithoutmask23, + __SIMD_fastpackwithoutmask24, __SIMD_fastpackwithoutmask25, + __SIMD_fastpackwithoutmask26, __SIMD_fastpackwithoutmask27, + __SIMD_fastpackwithoutmask28, __SIMD_fastpackwithoutmask29, + __SIMD_fastpackwithoutmask30, __SIMD_fastpackwithoutmask31, + __SIMD_fastpackwithoutmask32}; ALWAYS_INLINE -void SIMDpackwithoutmask(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - packwithoutmask[bit](in, out); +void SIMDpackwithoutmask(const uint32_t *__restrict__ in, + __m128i *__restrict__ out, const uint32_t bit) { + packwithoutmask[bit](in, out); } -constexpr packingfunction pack[33] = {__SIMD_fastpack0, __SIMD_fastpack1, __SIMD_fastpack2, __SIMD_fastpack3, __SIMD_fastpack4, __SIMD_fastpack5, __SIMD_fastpack6, __SIMD_fastpack7, __SIMD_fastpack8, __SIMD_fastpack9, __SIMD_fastpack10, __SIMD_fastpack11, __SIMD_fastpack12, __SIMD_fastpack13, __SIMD_fastpack14, __SIMD_fastpack15, __SIMD_fastpack16, __SIMD_fastpack17, __SIMD_fastpack18, __SIMD_fastpack19, __SIMD_fastpack20, __SIMD_fastpack21, __SIMD_fastpack22, __SIMD_fastpack23, __SIMD_fastpack24, __SIMD_fastpack25, __SIMD_fastpack26, __SIMD_fastpack27, __SIMD_fastpack28, __SIMD_fastpack29, __SIMD_fastpack30, __SIMD_fastpack31, __SIMD_fastpack32}; +constexpr packingfunction pack[33] = { + __SIMD_fastpack0, __SIMD_fastpack1, __SIMD_fastpack2, __SIMD_fastpack3, + __SIMD_fastpack4, __SIMD_fastpack5, __SIMD_fastpack6, __SIMD_fastpack7, + __SIMD_fastpack8, __SIMD_fastpack9, __SIMD_fastpack10, __SIMD_fastpack11, + __SIMD_fastpack12, 
__SIMD_fastpack13, __SIMD_fastpack14, __SIMD_fastpack15, + __SIMD_fastpack16, __SIMD_fastpack17, __SIMD_fastpack18, __SIMD_fastpack19, + __SIMD_fastpack20, __SIMD_fastpack21, __SIMD_fastpack22, __SIMD_fastpack23, + __SIMD_fastpack24, __SIMD_fastpack25, __SIMD_fastpack26, __SIMD_fastpack27, + __SIMD_fastpack28, __SIMD_fastpack29, __SIMD_fastpack30, __SIMD_fastpack31, + __SIMD_fastpack32}; ALWAYS_INLINE -void SIMDpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - pack[bit](in, out); +void SIMDpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, + const uint32_t bit) { + pack[bit](in, out); } - - - -constexpr unpackingfunction Uunpack[33] = {uSIMD_nullunpacker32, __uSIMD_fastunpack1, __uSIMD_fastunpack2, __uSIMD_fastunpack3, __uSIMD_fastunpack4, __uSIMD_fastunpack5, __uSIMD_fastunpack6, __uSIMD_fastunpack7, __uSIMD_fastunpack8, __uSIMD_fastunpack9, __uSIMD_fastunpack10, __uSIMD_fastunpack11, __uSIMD_fastunpack12, __uSIMD_fastunpack13, __uSIMD_fastunpack14, __uSIMD_fastunpack15, __uSIMD_fastunpack16, __uSIMD_fastunpack17, __uSIMD_fastunpack18, __uSIMD_fastunpack19, __uSIMD_fastunpack20, __uSIMD_fastunpack21, __uSIMD_fastunpack22, __uSIMD_fastunpack23, __uSIMD_fastunpack24, __uSIMD_fastunpack25, __uSIMD_fastunpack26, __uSIMD_fastunpack27, __uSIMD_fastunpack28, __uSIMD_fastunpack29, __uSIMD_fastunpack30, __uSIMD_fastunpack31, __uSIMD_fastunpack32}; +constexpr unpackingfunction Uunpack[33] = { + uSIMD_nullunpacker32, __uSIMD_fastunpack1, __uSIMD_fastunpack2, + __uSIMD_fastunpack3, __uSIMD_fastunpack4, __uSIMD_fastunpack5, + __uSIMD_fastunpack6, __uSIMD_fastunpack7, __uSIMD_fastunpack8, + __uSIMD_fastunpack9, __uSIMD_fastunpack10, __uSIMD_fastunpack11, + __uSIMD_fastunpack12, __uSIMD_fastunpack13, __uSIMD_fastunpack14, + __uSIMD_fastunpack15, __uSIMD_fastunpack16, __uSIMD_fastunpack17, + __uSIMD_fastunpack18, __uSIMD_fastunpack19, __uSIMD_fastunpack20, + __uSIMD_fastunpack21, __uSIMD_fastunpack22, __uSIMD_fastunpack23, + 
__uSIMD_fastunpack24, __uSIMD_fastunpack25, __uSIMD_fastunpack26, + __uSIMD_fastunpack27, __uSIMD_fastunpack28, __uSIMD_fastunpack29, + __uSIMD_fastunpack30, __uSIMD_fastunpack31, __uSIMD_fastunpack32}; ALWAYS_INLINE -void uSIMDunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - return Uunpack[bit](in, out); +void uSIMDunpack(const __m128i *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + return Uunpack[bit](in, out); } -constexpr packingfunction Upackwithoutmask[33] = {__uSIMD_fastpackwithoutmask0, __uSIMD_fastpackwithoutmask1, __uSIMD_fastpackwithoutmask2, __uSIMD_fastpackwithoutmask3, __uSIMD_fastpackwithoutmask4, __uSIMD_fastpackwithoutmask5, __uSIMD_fastpackwithoutmask6, __uSIMD_fastpackwithoutmask7, __uSIMD_fastpackwithoutmask8, __uSIMD_fastpackwithoutmask9, __uSIMD_fastpackwithoutmask10, __uSIMD_fastpackwithoutmask11, __uSIMD_fastpackwithoutmask12, __uSIMD_fastpackwithoutmask13, __uSIMD_fastpackwithoutmask14, __uSIMD_fastpackwithoutmask15, __uSIMD_fastpackwithoutmask16, __uSIMD_fastpackwithoutmask17, __uSIMD_fastpackwithoutmask18, __uSIMD_fastpackwithoutmask19, __uSIMD_fastpackwithoutmask20, __uSIMD_fastpackwithoutmask21, __uSIMD_fastpackwithoutmask22, __uSIMD_fastpackwithoutmask23, __uSIMD_fastpackwithoutmask24, __uSIMD_fastpackwithoutmask25, __uSIMD_fastpackwithoutmask26, __uSIMD_fastpackwithoutmask27, __uSIMD_fastpackwithoutmask28, __uSIMD_fastpackwithoutmask29, __uSIMD_fastpackwithoutmask30, __uSIMD_fastpackwithoutmask31, __uSIMD_fastpackwithoutmask32}; +constexpr packingfunction Upackwithoutmask[33] = { + __uSIMD_fastpackwithoutmask0, __uSIMD_fastpackwithoutmask1, + __uSIMD_fastpackwithoutmask2, __uSIMD_fastpackwithoutmask3, + __uSIMD_fastpackwithoutmask4, __uSIMD_fastpackwithoutmask5, + __uSIMD_fastpackwithoutmask6, __uSIMD_fastpackwithoutmask7, + __uSIMD_fastpackwithoutmask8, __uSIMD_fastpackwithoutmask9, + __uSIMD_fastpackwithoutmask10, __uSIMD_fastpackwithoutmask11, + 
__uSIMD_fastpackwithoutmask12, __uSIMD_fastpackwithoutmask13, + __uSIMD_fastpackwithoutmask14, __uSIMD_fastpackwithoutmask15, + __uSIMD_fastpackwithoutmask16, __uSIMD_fastpackwithoutmask17, + __uSIMD_fastpackwithoutmask18, __uSIMD_fastpackwithoutmask19, + __uSIMD_fastpackwithoutmask20, __uSIMD_fastpackwithoutmask21, + __uSIMD_fastpackwithoutmask22, __uSIMD_fastpackwithoutmask23, + __uSIMD_fastpackwithoutmask24, __uSIMD_fastpackwithoutmask25, + __uSIMD_fastpackwithoutmask26, __uSIMD_fastpackwithoutmask27, + __uSIMD_fastpackwithoutmask28, __uSIMD_fastpackwithoutmask29, + __uSIMD_fastpackwithoutmask30, __uSIMD_fastpackwithoutmask31, + __uSIMD_fastpackwithoutmask32}; ALWAYS_INLINE -void uSIMDpackwithoutmask(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - Upackwithoutmask[bit](in, out); +void uSIMDpackwithoutmask(const uint32_t *__restrict__ in, + __m128i *__restrict__ out, const uint32_t bit) { + Upackwithoutmask[bit](in, out); } -constexpr packingfunction Upack[33] = {__uSIMD_fastpack0, __uSIMD_fastpack1, __uSIMD_fastpack2, __uSIMD_fastpack3, __uSIMD_fastpack4, __uSIMD_fastpack5, __uSIMD_fastpack6, __uSIMD_fastpack7, __uSIMD_fastpack8, __uSIMD_fastpack9, __uSIMD_fastpack10, __uSIMD_fastpack11, __uSIMD_fastpack12, __uSIMD_fastpack13, __uSIMD_fastpack14, __uSIMD_fastpack15, __uSIMD_fastpack16, __uSIMD_fastpack17, __uSIMD_fastpack18, __uSIMD_fastpack19, __uSIMD_fastpack20, __uSIMD_fastpack21, __uSIMD_fastpack22, __uSIMD_fastpack23, __uSIMD_fastpack24, __uSIMD_fastpack25, __uSIMD_fastpack26, __uSIMD_fastpack27, __uSIMD_fastpack28, __uSIMD_fastpack29, __uSIMD_fastpack30, __uSIMD_fastpack31, __uSIMD_fastpack32}; +constexpr packingfunction Upack[33] = { + __uSIMD_fastpack0, __uSIMD_fastpack1, __uSIMD_fastpack2, + __uSIMD_fastpack3, __uSIMD_fastpack4, __uSIMD_fastpack5, + __uSIMD_fastpack6, __uSIMD_fastpack7, __uSIMD_fastpack8, + __uSIMD_fastpack9, __uSIMD_fastpack10, __uSIMD_fastpack11, + __uSIMD_fastpack12, __uSIMD_fastpack13, 
__uSIMD_fastpack14, + __uSIMD_fastpack15, __uSIMD_fastpack16, __uSIMD_fastpack17, + __uSIMD_fastpack18, __uSIMD_fastpack19, __uSIMD_fastpack20, + __uSIMD_fastpack21, __uSIMD_fastpack22, __uSIMD_fastpack23, + __uSIMD_fastpack24, __uSIMD_fastpack25, __uSIMD_fastpack26, + __uSIMD_fastpack27, __uSIMD_fastpack28, __uSIMD_fastpack29, + __uSIMD_fastpack30, __uSIMD_fastpack31, __uSIMD_fastpack32}; ALWAYS_INLINE -void uSIMDpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, const uint32_t bit) { - Upack[bit](in, out); +void uSIMDpack(const uint32_t *__restrict__ in, __m128i *__restrict__ out, + const uint32_t bit) { + Upack[bit](in, out); } - } +template struct SIMDBitPackingHelpers { -template -struct SIMDBitPackingHelpers { - - static void pack(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - if (SIMDBlockSize % 32) { - throw std::logic_error("Incorrect SIMDBlockSize."); - } - __m128i initoffset = _mm_set1_epi32(0); - - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>((in + k * SIMDBlockSize + SIMDBlockSize - 4))); - - if (bit < 32) SIMDDeltaProcessor::runDelta(initoffset, in + k * SIMDBlockSize); - simdpack(in + k * SIMDBlockSize, reinterpret_cast<__m128i *>(out + SIMDBlockSize * k * bit / 32), bit); - initoffset = nextoffset; - } + static void pack(uint32_t *in, const size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); } + if (SIMDBlockSize % 32) { + throw std::logic_error("Incorrect SIMDBlockSize."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>( + (in + k * SIMDBlockSize + SIMDBlockSize - 4))); + + if (bit < 32) + SIMDDeltaProcessor::runDelta( + initoffset, in + k * SIMDBlockSize); + 
simdpack(in + k * SIMDBlockSize, + reinterpret_cast<__m128i *>(out + SIMDBlockSize * k * bit / 32), + bit); + initoffset = nextoffset; + } + } - - static void unpack(const uint32_t *in, size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - __m128i initoffset = _mm_set1_epi32(0); - - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - simdunpack(reinterpret_cast(in + SIMDBlockSize * k * bit / 32), out + k * SIMDBlockSize, bit); - if (bit < 32) { - initoffset = SIMDDeltaProcessor::runPrefixSum(initoffset, out + k * SIMDBlockSize); - } - } + static void unpack(const uint32_t *in, size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + simdunpack( + reinterpret_cast(in + SIMDBlockSize * k * bit / 32), + out + k * SIMDBlockSize, bit); + if (bit < 32) { + initoffset = + SIMDDeltaProcessor::runPrefixSum( + initoffset, out + k * SIMDBlockSize); + } } + } - static void packwithoutmask(uint32_t *in, const size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - __m128i initoffset = _mm_set1_epi32(0); - - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>((in + k * SIMDBlockSize + SIMDBlockSize - 4))); - if (bit < 32) SIMDDeltaProcessor::runDelta(initoffset, in + k * SIMDBlockSize); - simdpackwithoutmask(in + k * SIMDBlockSize, reinterpret_cast<__m128i *>(out + SIMDBlockSize * k * bit / 32), bit); - initoffset = 
nextoffset; - } + static void packwithoutmask(uint32_t *in, const size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); } + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + __m128i nextoffset = MM_LOAD_SI_128(reinterpret_cast<__m128i *>( + (in + k * SIMDBlockSize + SIMDBlockSize - 4))); + if (bit < 32) + SIMDDeltaProcessor::runDelta( + initoffset, in + k * SIMDBlockSize); + simdpackwithoutmask( + in + k * SIMDBlockSize, + reinterpret_cast<__m128i *>(out + SIMDBlockSize * k * bit / 32), bit); + initoffset = nextoffset; + } + } - static void ipack(const uint32_t *in, const size_t Qty, uint32_t *_out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i initoffset = _mm_set1_epi32(0U);; - - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - SIMDipack(initoffset, in + k * SIMDBlockSize, out + k * bit, bit); - initoffset = MM_LOAD_SI_128(reinterpret_cast(in + k * SIMDBlockSize + SIMDBlockSize - 4)); - //memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof initoffset);// Daniel: memcpy looks like a hack - } + static void ipack(const uint32_t *in, const size_t Qty, uint32_t *_out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); } + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i initoffset = _mm_set1_epi32(0U); + ; + + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + SIMDipack(initoffset, in + k * SIMDBlockSize, out + k * bit, + bit); + initoffset = MM_LOAD_SI_128(reinterpret_cast( + in + k * SIMDBlockSize + SIMDBlockSize - 4)); + // memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof + // initoffset);// Daniel: memcpy looks like a hack + } + } 
- static void ipackwithoutmask(const uint32_t *in, const size_t Qty, uint32_t *_out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i initoffset = _mm_set1_epi32(0U);; - - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - SIMDipackwithoutmask(initoffset, in + k * SIMDBlockSize, out + k * bit, bit); - initoffset = MM_LOAD_SI_128(reinterpret_cast(in + k * SIMDBlockSize + SIMDBlockSize - 4)); - //memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof initoffset);// Daniel: memcpy looks like a hack - } + static void ipackwithoutmask(const uint32_t *in, const size_t Qty, + uint32_t *_out, const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i initoffset = _mm_set1_epi32(0U); + ; + + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + SIMDipackwithoutmask(initoffset, in + k * SIMDBlockSize, + out + k * bit, bit); + initoffset = MM_LOAD_SI_128(reinterpret_cast( + in + k * SIMDBlockSize + SIMDBlockSize - 4)); + // memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof + // initoffset);// Daniel: memcpy looks like a hack } + } - static void iunpack(const uint32_t *_in, size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - const __m128i *in = reinterpret_cast(_in); + static void iunpack(const uint32_t *_in, size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + const __m128i *in = reinterpret_cast(_in); - __m128i initoffset = _mm_set1_epi32(0U);; + __m128i initoffset = _mm_set1_epi32(0U); + ; - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - initoffset = SIMDiunpack(initoffset, in + k * bit, out + k * SIMDBlockSize, bit); - } + for (size_t k = 0; 
k < Qty / SIMDBlockSize; ++k) { + initoffset = SIMDiunpack(initoffset, in + k * bit, + out + k * SIMDBlockSize, bit); } + } - // this is not expected to be useful, only for benchmarking - static void ipatchedunpack(const uint32_t *_in, size_t Qty, uint32_t *out, const uint32_t bit) { - if (Qty % SIMDBlockSize) { - throw std::logic_error("Incorrect # of entries."); - } - const __m128i *in = reinterpret_cast(_in); + // this is not expected to be useful, only for benchmarking + static void ipatchedunpack(const uint32_t *_in, size_t Qty, uint32_t *out, + const uint32_t bit) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + const __m128i *in = reinterpret_cast(_in); - __m128i initoffset = _mm_set1_epi32(0U);; + __m128i initoffset = _mm_set1_epi32(0U); + ; - for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { - initoffset = SIMDipatchedunpack(initoffset, in + k * bit, out + k * SIMDBlockSize, - reinterpret_cast(out + k * SIMDBlockSize), bit); - } + for (size_t k = 0; k < Qty / SIMDBlockSize; ++k) { + initoffset = SIMDipatchedunpack( + initoffset, in + k * bit, out + k * SIMDBlockSize, + reinterpret_cast(out + k * SIMDBlockSize), bit); } + } - - static void CheckMaxDiff(const std::vector &refdata, unsigned bit) { - for (size_t i = 4; i < refdata.size(); ++i) { - if (gccbits(refdata[i] - refdata[i - 4]) > bit) throw std::runtime_error("bug"); - } + static void CheckMaxDiff(const std::vector &refdata, unsigned bit) { + for (size_t i = 4; i < refdata.size(); ++i) { + if (gccbits(refdata[i] - refdata[i - 4]) > bit) + throw std::runtime_error("bug"); } + } }; } // namespace SIMDCompressionLib diff --git a/include/simdfastpfor.h b/include/simdfastpfor.h index 162ce7f..beb6fbd 100644 --- a/include/simdfastpfor.h +++ b/include/simdfastpfor.h @@ -16,262 +16,257 @@ namespace SIMDCompressionLib { - - /** * SIMDFastPFor * * Reference and documentation: * - * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through 
vectorization + * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second + * through vectorization * http://arxiv.org/abs/1209.2137 * - * Note that this implementation is slightly improved compared to the version presented + * Note that this implementation is slightly improved compared to the version + * presented * in the paper. * - * Designed by D. Lemire with ideas from Leonid Boytsov. This scheme is NOT patented. + * Designed by D. Lemire with ideas from Leonid Boytsov. This scheme is NOT + * patented. * */ -template -class SIMDFastPFor: public IntegerCODEC { +template +class SIMDFastPFor : public IntegerCODEC { public: - /** - * ps (page size) should be a multiple of BlockSize, any "large" - * value should do. - */ - SIMDFastPFor(uint32_t ps = 65536) : - PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), + /** + * ps (page size) should be a multiple of BlockSize, any "large" + * value should do. + */ + SIMDFastPFor(uint32_t ps = 65536) + : PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), bytescontainer(PageSize + 3 * PageSize / BlockSize) { - assert(ps / BlockSize * BlockSize == ps); - assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); - } - enum { - PACKSIZE = 32, - overheadofeachexcept = 8, - overheadduetobits = 8, - overheadduetonmbrexcept = 8, - BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE - }; + assert(ps / BlockSize * BlockSize == ps); + assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + } + enum { + PACKSIZE = 32, + overheadofeachexcept = 8, + overheadduetobits = 8, + overheadduetonmbrexcept = 8, + BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE + }; + const uint32_t PageSize; + const uint32_t bitsPageSize; + SortedBitPacker bpacker; + vector bytescontainer; - - const uint32_t PageSize; - const uint32_t bitsPageSize; - SortedBitPacker bpacker; - vector bytescontainer; - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { + const 
uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { #ifdef USE_ALIGNED - if (needPaddingTo128Bits(out) - or needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - const uint32_t *const initin(in); - const size_t mynvalue = *in; - ++in; - if (mynvalue > nvalue) - throw NotEnoughStorage(mynvalue); - nvalue = mynvalue; - const uint32_t *const finalout(out + nvalue); - __m128i prev = _mm_set1_epi32(0); - while (out != finalout) { - size_t thisnvalue(0); - size_t thissize = - static_cast(finalout > PageSize + out ? PageSize - : (finalout - out)); + const uint32_t *const initin(in); + const size_t mynvalue = *in; + ++in; + if (mynvalue > nvalue) + throw NotEnoughStorage(mynvalue); + nvalue = mynvalue; + const uint32_t *const finalout(out + nvalue); + __m128i prev = _mm_set1_epi32(0); + while (out != finalout) { + size_t thisnvalue(0); + size_t thissize = static_cast( + finalout > PageSize + out ? PageSize : (finalout - out)); - __decodeArray(in, thisnvalue, out, thissize, prev); - in += thisnvalue; - out += thissize; - } - assert(initin + length >= in); - bpacker.reset();// if you don't do this, the codec has a "memory". - return in; + __decodeArray(in, thisnvalue, out, thissize, prev); + in += thisnvalue; + out += thissize; } + assert(initin + length >= in); + bpacker.reset(); // if you don't do this, the codec has a "memory". + return in; + } - /** - * If you save the output and recover it in memory, you are - * responsible to ensure that the alignment is preserved. - * - * The input size (length) should be a multiple of - * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done - * to simplify slightly the implementation.) 
- */ - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { + /** + * If you save the output and recover it in memory, you are + * responsible to ensure that the alignment is preserved. + * + * The input size (length) should be a multiple of + * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done + * to simplify slightly the implementation.) + */ + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { #ifdef USE_ALIGNED - if (needPaddingTo128Bits(out) - or needPaddingTo128Bits(in)) throw - std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + if (needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) + throw std::runtime_error( + "alignment issue: pointers should be aligned on 128-bit boundaries"); #endif - checkifdivisibleby(length, BlockSize); - const uint32_t *const initout(out); - const uint32_t *const finalin(in + length); + checkifdivisibleby(length, BlockSize); + const uint32_t *const initout(out); + const uint32_t *const finalin(in + length); - *out++ = static_cast(length); - const size_t oldnvalue = nvalue; - nvalue = 1; - __m128i prev = _mm_set1_epi32(0); - while (in != finalin) { - size_t thissize = - static_cast(finalin > PageSize + in ? PageSize - : (finalin - in)); - size_t thisnvalue(0); - __encodeArray(in, thissize, out, thisnvalue, prev); - nvalue += thisnvalue; - out += thisnvalue; - in += thissize; - } - assert(out == nvalue + initout); - if (oldnvalue < nvalue) - cerr << "It is possible we have a buffer overrun. " << endl; - bpacker.reset();// if you don't do this, the buffer has a memory + *out++ = static_cast(length); + const size_t oldnvalue = nvalue; + nvalue = 1; + __m128i prev = _mm_set1_epi32(0); + while (in != finalin) { + size_t thissize = static_cast( + finalin > PageSize + in ? 
PageSize : (finalin - in)); + size_t thisnvalue(0); + __encodeArray(in, thissize, out, thisnvalue, prev); + nvalue += thisnvalue; + out += thisnvalue; + in += thissize; } + assert(out == nvalue + initout); + if (oldnvalue < nvalue) + cerr << "It is possible we have a buffer overrun. " << endl; + bpacker.reset(); // if you don't do this, the buffer has a memory + } - - void getBestBFromData(const uint32_t *in, uint8_t &bestb, - uint8_t &bestcexcept, uint8_t &maxb) { - uint32_t freqs[33]; - for (uint32_t k = 0; k <= 32; ++k) - freqs[k] = 0; - for (uint32_t k = 0; k < BlockSize; ++k) { - freqs[asmbits(in[k])]++; - } - bestb = 32; - while (freqs[bestb] == 0) - bestb--; - maxb = bestb; - uint32_t bestcost = bestb * BlockSize; - uint32_t cexcept = 0; + void getBestBFromData(const uint32_t *in, uint8_t &bestb, + uint8_t &bestcexcept, uint8_t &maxb) { + uint32_t freqs[33]; + for (uint32_t k = 0; k <= 32; ++k) + freqs[k] = 0; + for (uint32_t k = 0; k < BlockSize; ++k) { + freqs[asmbits(in[k])]++; + } + bestb = 32; + while (freqs[bestb] == 0) + bestb--; + maxb = bestb; + uint32_t bestcost = bestb * BlockSize; + uint32_t cexcept = 0; + bestcexcept = static_cast(cexcept); + for (uint32_t b = bestb - 1; b < 32; --b) { + cexcept += freqs[b + 1]; + uint32_t thiscost = cexcept * overheadofeachexcept + + cexcept * (maxb - b) + b * BlockSize + + 8; // the extra 8 is the cost of storing maxbits + if (bestb - b == 1) + thiscost -= cexcept; + if (thiscost < bestcost) { + bestcost = thiscost; + bestb = static_cast(b); bestcexcept = static_cast(cexcept); - for (uint32_t b = bestb - 1; b < 32; --b) { - cexcept += freqs[b + 1]; - uint32_t thiscost = cexcept * overheadofeachexcept + cexcept - * (maxb - b) + b * BlockSize + 8;// the extra 8 is the cost of storing maxbits - if(bestb - b == 1) thiscost -= cexcept; - if (thiscost < bestcost) { - bestcost = thiscost; - bestb = static_cast(b); - bestcexcept = static_cast(cexcept); - } - } + } } + } - void __encodeArray(uint32_t *in, const size_t 
length, uint32_t *out, - size_t &nvalue, __m128i &prev) { // = _mm_set1_epi32 (0);// for delta - uint32_t *const initout = out; // keep track of this - checkifdivisibleby(length, BlockSize); - uint32_t *const headerout = out++; // keep track of this - bpacker.clear(); - uint8_t *bc = bytescontainer.data(); + void + __encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue, + __m128i &prev) { // = _mm_set1_epi32 (0);// for delta + uint32_t *const initout = out; // keep track of this + checkifdivisibleby(length, BlockSize); + uint32_t *const headerout = out++; // keep track of this + bpacker.clear(); + uint8_t *bc = bytescontainer.data(); #ifdef USE_ALIGNED - out = padTo128bits(out); - if (needPaddingTo128Bits(in)) throw std::runtime_error("alignment bug"); + out = padTo128bits(out); + if (needPaddingTo128Bits(in)) + throw std::runtime_error("alignment bug"); #endif - for (const uint32_t *const final = in + length; (in + BlockSize - <= final); in += BlockSize) { - uint8_t bestb, bestcexcept, maxb; + for (const uint32_t *const final = in + length; (in + BlockSize <= final); + in += BlockSize) { + uint8_t bestb, bestcexcept, maxb; - const __m128i nextprev = MM_LOAD_SI_128(reinterpret_cast(in + BlockSize - 4)); - SIMDDeltaProcessor::runDelta(prev, in); - prev = nextprev; + const __m128i nextprev = + MM_LOAD_SI_128(reinterpret_cast(in + BlockSize - 4)); + SIMDDeltaProcessor::runDelta(prev, in); + prev = nextprev; - getBestBFromData(in, bestb, bestcexcept, maxb); - *bc++ = bestb; - *bc++ = bestcexcept; - if (bestcexcept > 0) { - *bc++ = maxb; - bpacker.ensureCapacity(maxb - bestb - 1, bestcexcept); - const uint32_t maxval = 1U << bestb; - for (uint32_t k = 0; k < BlockSize; ++k) { - if (in[k] >= maxval) { - bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); - *bc++ = static_cast(k); - } - } - } - for(int k = 0; k(out), bestb); - out += 4 * bestb; - } + getBestBFromData(in, bestb, bestcexcept, maxb); + *bc++ = bestb; + *bc++ = bestcexcept; + if 
(bestcexcept > 0) { + *bc++ = maxb; + bpacker.ensureCapacity(maxb - bestb - 1, bestcexcept); + const uint32_t maxval = 1U << bestb; + for (uint32_t k = 0; k < BlockSize; ++k) { + if (in[k] >= maxval) { + bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); + *bc++ = static_cast(k); + } } - headerout[0] = static_cast(out - headerout); - const uint32_t bytescontainersize = static_cast(bc - bytescontainer.data()); - *(out++) = bytescontainersize; - memcpy(out, bytescontainer.data(), bytescontainersize); - out += (bytescontainersize + sizeof(uint32_t) - 1) - / sizeof(uint32_t); - const uint32_t *const lastout = bpacker.write(out); - nvalue = lastout - initout; + } + for (int k = 0; k < BlockSize; k += 128) { + simdpack(in + k, reinterpret_cast<__m128i *>(out), bestb); + out += 4 * bestb; + } } + headerout[0] = static_cast(out - headerout); + const uint32_t bytescontainersize = + static_cast(bc - bytescontainer.data()); + *(out++) = bytescontainersize; + memcpy(out, bytescontainer.data(), bytescontainersize); + out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + const uint32_t *const lastout = bpacker.write(out); + nvalue = lastout - initout; + } - void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, - const size_t nvalue, __m128i &prev) { - const uint32_t *const initin = in; - const uint32_t *const headerin = in++; - const uint32_t wheremeta = headerin[0]; - const uint32_t *inexcept = headerin + wheremeta; - const uint32_t bytesize = *inexcept++; - const uint8_t *bytep = reinterpret_cast(inexcept); + void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, + const size_t nvalue, __m128i &prev) { + const uint32_t *const initin = in; + const uint32_t *const headerin = in++; + const uint32_t wheremeta = headerin[0]; + const uint32_t *inexcept = headerin + wheremeta; + const uint32_t bytesize = *inexcept++; + const uint8_t *bytep = reinterpret_cast(inexcept); - inexcept += (bytesize + sizeof(uint32_t) - 1) / 
sizeof(uint32_t); - inexcept = bpacker.read(inexcept); - length = inexcept - initin; - const uint32_t *unpackpointers[32 + 1]; - for (uint32_t k = 1; k <= 32; ++k) { - unpackpointers[k] = bpacker.get(k - 1); - } + inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + inexcept = bpacker.read(inexcept); + length = inexcept - initin; + const uint32_t *unpackpointers[32 + 1]; + for (uint32_t k = 1; k <= 32; ++k) { + unpackpointers[k] = bpacker.get(k - 1); + } #ifdef USE_ALIGNED - in = padTo128bits(in); - assert(!needPaddingTo128Bits(out)); + in = padTo128bits(in); + assert(!needPaddingTo128Bits(out)); #endif - for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out - += BlockSize) { - const uint8_t b = *bytep++; - const uint8_t cexcept = *bytep++; - for(int k = 0; k(in), out + k, b); - else - ArrayDispatch::SIMDunpack(reinterpret_cast(in), out + k, b); - in += 4 * b; - } - if (cexcept > 0) { - const uint8_t maxbits = *bytep++; - if(maxbits - b == 1) { - for (uint32_t k = 0; k < cexcept; ++k) { - const uint8_t pos = *(bytep++); - out[pos] |= static_cast(1) << b; - } - } else { - const uint32_t *vals = unpackpointers[maxbits - b]; - unpackpointers[maxbits - b] += cexcept; - for (uint32_t k = 0; k < cexcept; ++k) { - const uint8_t pos = *(bytep++); - out[pos] |= vals[k] << b; - } - } - } - prev = SIMDDeltaProcessor::runPrefixSum(prev, out); - + for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out += BlockSize) { + const uint8_t b = *bytep++; + const uint8_t cexcept = *bytep++; + for (int k = 0; k < BlockSize; k += 128) { + if (arraydispatch) + simdunpack(reinterpret_cast(in), out + k, b); + else + ArrayDispatch::SIMDunpack(reinterpret_cast(in), + out + k, b); + in += 4 * b; + } + if (cexcept > 0) { + const uint8_t maxbits = *bytep++; + if (maxbits - b == 1) { + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= static_cast(1) << b; + } + } else { + const uint32_t *vals = unpackpointers[maxbits - b]; + 
unpackpointers[maxbits - b] += cexcept; + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= vals[k] << b; + } } - - assert(in == headerin + wheremeta); + } + prev = + SIMDDeltaProcessor::runPrefixSum(prev, out); } - string name() const { - return string("SIMDFastPFor") + DeltaHelper::name(); - } + assert(in == headerin + wheremeta); + } + string name() const { return string("SIMDFastPFor") + DeltaHelper::name(); } }; - - - - - - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_SIMDFASTPFOR_H_ */ diff --git a/include/simdintegratedbitpacking.h b/include/simdintegratedbitpacking.h index f9d290d..9493fae 100644 --- a/include/simdintegratedbitpacking.h +++ b/include/simdintegratedbitpacking.h @@ -8,7 +8,6 @@ #ifndef SIMDCompressionAndIntersection_SIMD_INTEGRATED_BITPACKING_H #define SIMDCompressionAndIntersection_SIMD_INTEGRATED_BITPACKING_H - /** * To avoid crazy dependencies, this header should not * include any other header beside delta.h. 
@@ -17,684 +16,878 @@ namespace SIMDCompressionLib { - - -template -__m128i iunpack0(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack0(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack0(__m128i, const __m128i *, uint32_t *); template -void ipack0(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack0(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack0(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack1(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack1(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack1(__m128i, const __m128i *, uint32_t *); template -void ipack1(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack1(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack1(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack2(__m128i, const __m128i *, uint32_t *); +__m128i iunpack2(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack2(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack2(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack2(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack2(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack3(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack3(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack3(__m128i, const __m128i *, uint32_t *); template -void ipack3(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack3(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack3(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask3(__m128i, const uint32_t 
*, __m128i *); - -template -__m128i iunpack4(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack4(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack4(__m128i, const __m128i *, uint32_t *); template -void ipack4(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack4(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack4(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack5(__m128i, const __m128i *, uint32_t *); +__m128i iunpack5(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack5(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack5(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack5(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack5(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack6(__m128i, const __m128i *, uint32_t *); +__m128i iunpack6(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack6(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack6(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack6(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack6(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack7(__m128i, const __m128i *, uint32_t *); +__m128i iunpack7(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack7(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack7(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack7(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack7(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); - -template -__m128i 
iunpack8(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack8(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack8(__m128i, const __m128i *, uint32_t *); template -void ipack8(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack8(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack8(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack9(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack9(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack9(__m128i, const __m128i *, uint32_t *); template -void ipack9(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack9(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack9(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack10(__m128i, const __m128i *, uint32_t *); +__m128i iunpack10(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack10(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack10(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack10(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack10(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack11(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack11(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack11(__m128i, const __m128i *, uint32_t *); template -void ipack11(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack11(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack11(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack12(__m128i, const 
__m128i *, uint32_t *); template -__m128i ipatchedunpack12(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack12(__m128i, const __m128i *, uint32_t *); template -void ipack12(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack12(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack12(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack13(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack13(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack13(__m128i, const __m128i *, uint32_t *); template -void ipack13(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack13(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack13(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack14(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack14(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack14(__m128i, const __m128i *, uint32_t *); template -void ipack14(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack14(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack14(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack15(__m128i, const __m128i *, uint32_t *); +__m128i iunpack15(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack15(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack15(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack15(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack15(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack16(__m128i, const __m128i *, 
uint32_t *); template -__m128i ipatchedunpack16(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack16(__m128i, const __m128i *, uint32_t *); template -void ipack16(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack16(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack16(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack17(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack17(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack17(__m128i, const __m128i *, uint32_t *); template -void ipack17(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack17(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack17(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack18(__m128i, const __m128i *, uint32_t *); +__m128i iunpack18(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack18(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack18(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack18(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack18(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack19(__m128i, const __m128i *, uint32_t *); +__m128i iunpack19(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack19(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack19(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack19(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack19(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack20(__m128i, const __m128i *, uint32_t *); 
+__m128i iunpack20(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack20(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack20(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack20(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack20(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack21(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack21(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack21(__m128i, const __m128i *, uint32_t *); template -void ipack21(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack21(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack21(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack22(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack22(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack22(__m128i, const __m128i *, uint32_t *); template -void ipack22(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack22(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack22(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack23(__m128i, const __m128i *, uint32_t *); +__m128i iunpack23(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack23(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack23(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack23(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack23(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack24(__m128i, const __m128i *, uint32_t *); template 
-__m128i ipatchedunpack24(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack24(__m128i, const __m128i *, uint32_t *); template -void ipack24(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack24(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack24(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack25(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack25(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack25(__m128i, const __m128i *, uint32_t *); template -void ipack25(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack25(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack25(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack26(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack26(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack26(__m128i, const __m128i *, uint32_t *); template -void ipack26(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack26(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack26(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack27(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack27(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack27(__m128i, const __m128i *, uint32_t *); template -void ipack27(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack27(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack27(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack28(__m128i, const __m128i *, uint32_t *); +__m128i 
iunpack28(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack28(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack28(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack28(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack28(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack29(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack29(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack29(__m128i, const __m128i *, uint32_t *); template -void ipack29(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack29(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack29(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); - -template -__m128i iunpack30(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack30(__m128i, const __m128i *, uint32_t *, const __m128i *); +__m128i iunpack30(__m128i, const __m128i *, uint32_t *); template -void ipack30(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack30(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack30(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack31(__m128i, const __m128i *, uint32_t *); +__m128i iunpack31(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack31(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack31(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack31(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack31(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); - template -__m128i iunpack32(__m128i, const __m128i *, uint32_t *); +__m128i 
iunpack32(__m128i, const __m128i *, uint32_t *); template -__m128i ipatchedunpack32(__m128i, const __m128i *, uint32_t *, const __m128i *); -template -void ipack32(__m128i, const uint32_t *, __m128i *); +__m128i ipatchedunpack32(__m128i, const __m128i *, uint32_t *, const __m128i *); +template void ipack32(__m128i, const uint32_t *, __m128i *); template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); - -typedef __m128i(*integratedunpackingfunction)(__m128i, const __m128i *, uint32_t *); -typedef __m128i(*integratedpatchedunpackingfunction)(__m128i, const __m128i *, uint32_t *, const __m128i *); +typedef __m128i (*integratedunpackingfunction)(__m128i, const __m128i *, + uint32_t *); +typedef __m128i (*integratedpatchedunpackingfunction)(__m128i, const __m128i *, + uint32_t *, + const __m128i *); typedef void (*integratedpackingfunction)(__m128i, const uint32_t *, __m128i *); - -template -struct IntegratedArrayDispatch { - static integratedunpackingfunction unpack[33]; - - static inline __m128i SIMDiunpack(__m128i initOffset, const __m128i *in, uint32_t *out, const uint32_t bit) { - return unpack[bit](initOffset, in, out); - } - static integratedpatchedunpackingfunction patchedunpack[33]; - - static inline __m128i SIMDipatchedunpack(__m128i initOffset, const __m128i *in, uint32_t *out, - const __m128i *patchedbuffer, const uint32_t bit) { - return patchedunpack[bit](initOffset, in, out, patchedbuffer); - } - static integratedpackingfunction packwithoutmask[33]; - - static inline void SIMDipackwithoutmask(__m128i initOffset, const uint32_t *in, __m128i *out, - const uint32_t bit) { - packwithoutmask[bit](initOffset, in, out); - } - static integratedpackingfunction pack[33]; - - static inline void SIMDipack(__m128i initOffset, const uint32_t *in, __m128i *out, const uint32_t bit) { - pack[bit](initOffset, in, out); - } +template struct IntegratedArrayDispatch { + static integratedunpackingfunction unpack[33]; + + static inline __m128i SIMDiunpack(__m128i 
initOffset, const __m128i *in, + uint32_t *out, const uint32_t bit) { + return unpack[bit](initOffset, in, out); + } + static integratedpatchedunpackingfunction patchedunpack[33]; + + static inline __m128i SIMDipatchedunpack(__m128i initOffset, + const __m128i *in, uint32_t *out, + const __m128i *patchedbuffer, + const uint32_t bit) { + return patchedunpack[bit](initOffset, in, out, patchedbuffer); + } + static integratedpackingfunction packwithoutmask[33]; + + static inline void SIMDipackwithoutmask(__m128i initOffset, + const uint32_t *in, __m128i *out, + const uint32_t bit) { + packwithoutmask[bit](initOffset, in, out); + } + static integratedpackingfunction pack[33]; + + static inline void SIMDipack(__m128i initOffset, const uint32_t *in, + __m128i *out, const uint32_t bit) { + pack[bit](initOffset, in, out); + } }; template -integratedunpackingfunction IntegratedArrayDispatch::unpack[33] = {iunpack0, iunpack1, iunpack2, iunpack3, iunpack4, iunpack5, iunpack6, iunpack7, iunpack8, iunpack9, iunpack10, iunpack11, iunpack12, iunpack13, iunpack14, iunpack15, iunpack16, iunpack17, iunpack18, iunpack19, iunpack20, iunpack21, iunpack22, iunpack23, iunpack24, iunpack25, iunpack26, iunpack27, iunpack28, iunpack29, iunpack30, iunpack31, iunpack32}; +integratedunpackingfunction IntegratedArrayDispatch::unpack[33] = { + iunpack0, iunpack1, iunpack2, + iunpack3, iunpack4, iunpack5, + iunpack6, iunpack7, iunpack8, + iunpack9, iunpack10, iunpack11, + iunpack12, iunpack13, iunpack14, + iunpack15, iunpack16, iunpack17, + iunpack18, iunpack19, iunpack20, + iunpack21, iunpack22, iunpack23, + iunpack24, iunpack25, iunpack26, + iunpack27, iunpack28, iunpack29, + iunpack30, iunpack31, iunpack32}; + +template +integratedpatchedunpackingfunction + IntegratedArrayDispatch::patchedunpack[33] = { + ipatchedunpack0, ipatchedunpack1, + ipatchedunpack2, ipatchedunpack3, + ipatchedunpack4, ipatchedunpack5, + ipatchedunpack6, ipatchedunpack7, + ipatchedunpack8, ipatchedunpack9, + 
ipatchedunpack10, ipatchedunpack11, + ipatchedunpack12, ipatchedunpack13, + ipatchedunpack14, ipatchedunpack15, + ipatchedunpack16, ipatchedunpack17, + ipatchedunpack18, ipatchedunpack19, + ipatchedunpack20, ipatchedunpack21, + ipatchedunpack22, ipatchedunpack23, + ipatchedunpack24, ipatchedunpack25, + ipatchedunpack26, ipatchedunpack27, + ipatchedunpack28, ipatchedunpack29, + ipatchedunpack30, ipatchedunpack31, + ipatchedunpack32}; + +template +integratedpackingfunction + IntegratedArrayDispatch::packwithoutmask[33] = { + ipackwithoutmask0, ipackwithoutmask1, + ipackwithoutmask2, ipackwithoutmask3, + ipackwithoutmask4, ipackwithoutmask5, + ipackwithoutmask6, ipackwithoutmask7, + ipackwithoutmask8, ipackwithoutmask9, + ipackwithoutmask10, ipackwithoutmask11, + ipackwithoutmask12, ipackwithoutmask13, + ipackwithoutmask14, ipackwithoutmask15, + ipackwithoutmask16, ipackwithoutmask17, + ipackwithoutmask18, ipackwithoutmask19, + ipackwithoutmask20, ipackwithoutmask21, + ipackwithoutmask22, ipackwithoutmask23, + ipackwithoutmask24, ipackwithoutmask25, + ipackwithoutmask26, ipackwithoutmask27, + ipackwithoutmask28, ipackwithoutmask29, + ipackwithoutmask30, ipackwithoutmask31, + ipackwithoutmask32}; + +template +integratedpackingfunction IntegratedArrayDispatch::pack[33] = { + ipack0, ipack1, ipack2, + ipack3, ipack4, ipack5, + ipack6, ipack7, ipack8, + ipack9, ipack10, ipack11, + ipack12, ipack13, ipack14, + ipack15, ipack16, ipack17, + ipack18, ipack19, ipack20, + ipack21, ipack22, ipack23, + ipack24, ipack25, ipack26, + ipack27, ipack28, ipack29, + ipack30, ipack31, ipack32}; -template -integratedpatchedunpackingfunction IntegratedArrayDispatch::patchedunpack[33] = {ipatchedunpack0, ipatchedunpack1, ipatchedunpack2, ipatchedunpack3, ipatchedunpack4, ipatchedunpack5, ipatchedunpack6, ipatchedunpack7, ipatchedunpack8, ipatchedunpack9, ipatchedunpack10, ipatchedunpack11, ipatchedunpack12, ipatchedunpack13, ipatchedunpack14, ipatchedunpack15, ipatchedunpack16, 
ipatchedunpack17, ipatchedunpack18, ipatchedunpack19, ipatchedunpack20, ipatchedunpack21, ipatchedunpack22, ipatchedunpack23, ipatchedunpack24, ipatchedunpack25, ipatchedunpack26, ipatchedunpack27, ipatchedunpack28, ipatchedunpack29, ipatchedunpack30, ipatchedunpack31, ipatchedunpack32}; +template +inline __m128i SIMDiunpack(__m128i initOffset, const __m128i *in, uint32_t *out, + const uint32_t bit) { + switch (bit) { + case 0: + return iunpack0(initOffset, in, out); + + case 1: + return iunpack1(initOffset, in, out); + + case 2: + return iunpack2(initOffset, in, out); + + case 3: + return iunpack3(initOffset, in, out); + + case 4: + return iunpack4(initOffset, in, out); + + case 5: + return iunpack5(initOffset, in, out); + + case 6: + return iunpack6(initOffset, in, out); + + case 7: + return iunpack7(initOffset, in, out); + + case 8: + return iunpack8(initOffset, in, out); + + case 9: + return iunpack9(initOffset, in, out); + + case 10: + return iunpack10(initOffset, in, out); + + case 11: + return iunpack11(initOffset, in, out); + + case 12: + return iunpack12(initOffset, in, out); -template -integratedpackingfunction IntegratedArrayDispatch::packwithoutmask[33] = {ipackwithoutmask0, ipackwithoutmask1, ipackwithoutmask2, ipackwithoutmask3, ipackwithoutmask4, ipackwithoutmask5, ipackwithoutmask6, ipackwithoutmask7, ipackwithoutmask8, ipackwithoutmask9, ipackwithoutmask10, ipackwithoutmask11, ipackwithoutmask12, ipackwithoutmask13, ipackwithoutmask14, ipackwithoutmask15, ipackwithoutmask16, ipackwithoutmask17, ipackwithoutmask18, ipackwithoutmask19, ipackwithoutmask20, ipackwithoutmask21, ipackwithoutmask22, ipackwithoutmask23, ipackwithoutmask24, ipackwithoutmask25, ipackwithoutmask26, ipackwithoutmask27, ipackwithoutmask28, ipackwithoutmask29, ipackwithoutmask30, ipackwithoutmask31, ipackwithoutmask32}; + case 13: + return iunpack13(initOffset, in, out); + + case 14: + return iunpack14(initOffset, in, out); -template -integratedpackingfunction 
IntegratedArrayDispatch::pack[33] = {ipack0, ipack1, ipack2, ipack3, ipack4, ipack5, ipack6, ipack7, ipack8, ipack9, ipack10, ipack11, ipack12, ipack13, ipack14, ipack15, ipack16, ipack17, ipack18, ipack19, ipack20, ipack21, ipack22, ipack23, ipack24, ipack25, ipack26, ipack27, ipack28, ipack29, ipack30, ipack31, ipack32}; + case 15: + return iunpack15(initOffset, in, out); + case 16: + return iunpack16(initOffset, in, out); + case 17: + return iunpack17(initOffset, in, out); -template -inline __m128i SIMDiunpack(__m128i initOffset, const __m128i *in, uint32_t *out, const uint32_t bit) { - switch (bit) { - case 0: return iunpack0(initOffset, in, out); - - case 1: return iunpack1(initOffset, in, out); - - case 2: return iunpack2(initOffset, in, out); - - case 3: return iunpack3(initOffset, in, out); - - case 4: return iunpack4(initOffset, in, out); - - case 5: return iunpack5(initOffset, in, out); - - case 6: return iunpack6(initOffset, in, out); - - case 7: return iunpack7(initOffset, in, out); - - case 8: return iunpack8(initOffset, in, out); - - case 9: return iunpack9(initOffset, in, out); - - case 10: return iunpack10(initOffset, in, out); - - case 11: return iunpack11(initOffset, in, out); - - case 12: return iunpack12(initOffset, in, out); + case 18: + return iunpack18(initOffset, in, out); - case 13: return iunpack13(initOffset, in, out); + case 19: + return iunpack19(initOffset, in, out); - case 14: return iunpack14(initOffset, in, out); + case 20: + return iunpack20(initOffset, in, out); - case 15: return iunpack15(initOffset, in, out); + case 21: + return iunpack21(initOffset, in, out); - case 16: return iunpack16(initOffset, in, out); + case 22: + return iunpack22(initOffset, in, out); - case 17: return iunpack17(initOffset, in, out); + case 23: + return iunpack23(initOffset, in, out); - case 18: return iunpack18(initOffset, in, out); + case 24: + return iunpack24(initOffset, in, out); - case 19: return iunpack19(initOffset, in, out); + case 25: + return 
iunpack25(initOffset, in, out); - case 20: return iunpack20(initOffset, in, out); + case 26: + return iunpack26(initOffset, in, out); - case 21: return iunpack21(initOffset, in, out); + case 27: + return iunpack27(initOffset, in, out); - case 22: return iunpack22(initOffset, in, out); + case 28: + return iunpack28(initOffset, in, out); - case 23: return iunpack23(initOffset, in, out); + case 29: + return iunpack29(initOffset, in, out); - case 24: return iunpack24(initOffset, in, out); + case 30: + return iunpack30(initOffset, in, out); - case 25: return iunpack25(initOffset, in, out); + case 31: + return iunpack31(initOffset, in, out); - case 26: return iunpack26(initOffset, in, out); + case 32: + return iunpack32(initOffset, in, out); - case 27: return iunpack27(initOffset, in, out); - - case 28: return iunpack28(initOffset, in, out); - - case 29: return iunpack29(initOffset, in, out); - - case 30: return iunpack30(initOffset, in, out); - - case 31: return iunpack31(initOffset, in, out); - - case 32: return iunpack32(initOffset, in, out); - - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - template -inline __m128i SIMDipatchedunpack(__m128i initOffset, const __m128i *in, uint32_t *out, - const __m128i *patchedbuffer, const uint32_t bit) { - switch (bit) { - case 0: return ipatchedunpack0(initOffset, in, out, patchedbuffer); +inline __m128i SIMDipatchedunpack(__m128i initOffset, const __m128i *in, + uint32_t *out, const __m128i *patchedbuffer, + const uint32_t bit) { + switch (bit) { + case 0: + return ipatchedunpack0(initOffset, in, out, patchedbuffer); - case 1: return ipatchedunpack1(initOffset, in, out, patchedbuffer); + case 1: + return ipatchedunpack1(initOffset, in, out, patchedbuffer); - case 2: return ipatchedunpack2(initOffset, in, out, patchedbuffer); + case 2: + return ipatchedunpack2(initOffset, in, out, patchedbuffer); - case 3: return 
ipatchedunpack3(initOffset, in, out, patchedbuffer); + case 3: + return ipatchedunpack3(initOffset, in, out, patchedbuffer); - case 4: return ipatchedunpack4(initOffset, in, out, patchedbuffer); + case 4: + return ipatchedunpack4(initOffset, in, out, patchedbuffer); - case 5: return ipatchedunpack5(initOffset, in, out, patchedbuffer); + case 5: + return ipatchedunpack5(initOffset, in, out, patchedbuffer); - case 6: return ipatchedunpack6(initOffset, in, out, patchedbuffer); + case 6: + return ipatchedunpack6(initOffset, in, out, patchedbuffer); - case 7: return ipatchedunpack7(initOffset, in, out, patchedbuffer); + case 7: + return ipatchedunpack7(initOffset, in, out, patchedbuffer); - case 8: return ipatchedunpack8(initOffset, in, out, patchedbuffer); + case 8: + return ipatchedunpack8(initOffset, in, out, patchedbuffer); - case 9: return ipatchedunpack9(initOffset, in, out, patchedbuffer); + case 9: + return ipatchedunpack9(initOffset, in, out, patchedbuffer); - case 10: return ipatchedunpack10(initOffset, in, out, patchedbuffer); + case 10: + return ipatchedunpack10(initOffset, in, out, patchedbuffer); - case 11: return ipatchedunpack11(initOffset, in, out, patchedbuffer); + case 11: + return ipatchedunpack11(initOffset, in, out, patchedbuffer); - case 12: return ipatchedunpack12(initOffset, in, out, patchedbuffer); + case 12: + return ipatchedunpack12(initOffset, in, out, patchedbuffer); - case 13: return ipatchedunpack13(initOffset, in, out, patchedbuffer); + case 13: + return ipatchedunpack13(initOffset, in, out, patchedbuffer); - case 14: return ipatchedunpack14(initOffset, in, out, patchedbuffer); + case 14: + return ipatchedunpack14(initOffset, in, out, patchedbuffer); - case 15: return ipatchedunpack15(initOffset, in, out, patchedbuffer); + case 15: + return ipatchedunpack15(initOffset, in, out, patchedbuffer); - case 16: return ipatchedunpack16(initOffset, in, out, patchedbuffer); + case 16: + return ipatchedunpack16(initOffset, in, out, patchedbuffer); 
- case 17: return ipatchedunpack17(initOffset, in, out, patchedbuffer); + case 17: + return ipatchedunpack17(initOffset, in, out, patchedbuffer); - case 18: return ipatchedunpack18(initOffset, in, out, patchedbuffer); + case 18: + return ipatchedunpack18(initOffset, in, out, patchedbuffer); - case 19: return ipatchedunpack19(initOffset, in, out, patchedbuffer); + case 19: + return ipatchedunpack19(initOffset, in, out, patchedbuffer); - case 20: return ipatchedunpack20(initOffset, in, out, patchedbuffer); + case 20: + return ipatchedunpack20(initOffset, in, out, patchedbuffer); - case 21: return ipatchedunpack21(initOffset, in, out, patchedbuffer); + case 21: + return ipatchedunpack21(initOffset, in, out, patchedbuffer); - case 22: return ipatchedunpack22(initOffset, in, out, patchedbuffer); + case 22: + return ipatchedunpack22(initOffset, in, out, patchedbuffer); - case 23: return ipatchedunpack23(initOffset, in, out, patchedbuffer); + case 23: + return ipatchedunpack23(initOffset, in, out, patchedbuffer); - case 24: return ipatchedunpack24(initOffset, in, out, patchedbuffer); + case 24: + return ipatchedunpack24(initOffset, in, out, patchedbuffer); - case 25: return ipatchedunpack25(initOffset, in, out, patchedbuffer); + case 25: + return ipatchedunpack25(initOffset, in, out, patchedbuffer); - case 26: return ipatchedunpack26(initOffset, in, out, patchedbuffer); + case 26: + return ipatchedunpack26(initOffset, in, out, patchedbuffer); - case 27: return ipatchedunpack27(initOffset, in, out, patchedbuffer); + case 27: + return ipatchedunpack27(initOffset, in, out, patchedbuffer); - case 28: return ipatchedunpack28(initOffset, in, out, patchedbuffer); + case 28: + return ipatchedunpack28(initOffset, in, out, patchedbuffer); - case 29: return ipatchedunpack29(initOffset, in, out, patchedbuffer); + case 29: + return ipatchedunpack29(initOffset, in, out, patchedbuffer); - case 30: return ipatchedunpack30(initOffset, in, out, patchedbuffer); + case 30: + return 
ipatchedunpack30(initOffset, in, out, patchedbuffer); - case 31: return ipatchedunpack31(initOffset, in, out, patchedbuffer); + case 31: + return ipatchedunpack31(initOffset, in, out, patchedbuffer); - case 32: return ipatchedunpack32(initOffset, in, out, patchedbuffer); + case 32: + return ipatchedunpack32(initOffset, in, out, patchedbuffer); - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - /*assumes that integers fit in the prescribed number of bits*/ template -void SIMDipackwithoutmask(__m128i initOffset, const uint32_t *in, __m128i *out, const uint32_t bit) { - switch (bit) { - case 0: return; - - case 1: ipackwithoutmask1(initOffset, in, out); return; +void SIMDipackwithoutmask(__m128i initOffset, const uint32_t *in, __m128i *out, + const uint32_t bit) { + switch (bit) { + case 0: + return; - case 2: ipackwithoutmask2(initOffset, in, out); return; + case 1: + ipackwithoutmask1(initOffset, in, out); + return; - case 3: ipackwithoutmask3(initOffset, in, out); return; + case 2: + ipackwithoutmask2(initOffset, in, out); + return; - case 4: ipackwithoutmask4(initOffset, in, out); return; + case 3: + ipackwithoutmask3(initOffset, in, out); + return; - case 5: ipackwithoutmask5(initOffset, in, out); return; + case 4: + ipackwithoutmask4(initOffset, in, out); + return; - case 6: ipackwithoutmask6(initOffset, in, out); return; + case 5: + ipackwithoutmask5(initOffset, in, out); + return; - case 7: ipackwithoutmask7(initOffset, in, out); return; + case 6: + ipackwithoutmask6(initOffset, in, out); + return; - case 8: ipackwithoutmask8(initOffset, in, out); return; + case 7: + ipackwithoutmask7(initOffset, in, out); + return; - case 9: ipackwithoutmask9(initOffset, in, out); return; + case 8: + ipackwithoutmask8(initOffset, in, out); + return; - case 10: ipackwithoutmask10(initOffset, in, out); return; + case 9: + ipackwithoutmask9(initOffset, in, out); + 
return; - case 11: ipackwithoutmask11(initOffset, in, out); return; + case 10: + ipackwithoutmask10(initOffset, in, out); + return; - case 12: ipackwithoutmask12(initOffset, in, out); return; + case 11: + ipackwithoutmask11(initOffset, in, out); + return; - case 13: ipackwithoutmask13(initOffset, in, out); return; + case 12: + ipackwithoutmask12(initOffset, in, out); + return; - case 14: ipackwithoutmask14(initOffset, in, out); return; + case 13: + ipackwithoutmask13(initOffset, in, out); + return; - case 15: ipackwithoutmask15(initOffset, in, out); return; + case 14: + ipackwithoutmask14(initOffset, in, out); + return; - case 16: ipackwithoutmask16(initOffset, in, out); return; + case 15: + ipackwithoutmask15(initOffset, in, out); + return; - case 17: ipackwithoutmask17(initOffset, in, out); return; + case 16: + ipackwithoutmask16(initOffset, in, out); + return; - case 18: ipackwithoutmask18(initOffset, in, out); return; + case 17: + ipackwithoutmask17(initOffset, in, out); + return; - case 19: ipackwithoutmask19(initOffset, in, out); return; + case 18: + ipackwithoutmask18(initOffset, in, out); + return; - case 20: ipackwithoutmask20(initOffset, in, out); return; + case 19: + ipackwithoutmask19(initOffset, in, out); + return; - case 21: ipackwithoutmask21(initOffset, in, out); return; + case 20: + ipackwithoutmask20(initOffset, in, out); + return; - case 22: ipackwithoutmask22(initOffset, in, out); return; + case 21: + ipackwithoutmask21(initOffset, in, out); + return; - case 23: ipackwithoutmask23(initOffset, in, out); return; + case 22: + ipackwithoutmask22(initOffset, in, out); + return; - case 24: ipackwithoutmask24(initOffset, in, out); return; + case 23: + ipackwithoutmask23(initOffset, in, out); + return; - case 25: ipackwithoutmask25(initOffset, in, out); return; + case 24: + ipackwithoutmask24(initOffset, in, out); + return; + + case 25: + ipackwithoutmask25(initOffset, in, out); + return; - case 26: ipackwithoutmask26(initOffset, in, out); return; + 
case 26: + ipackwithoutmask26(initOffset, in, out); + return; - case 27: ipackwithoutmask27(initOffset, in, out); return; + case 27: + ipackwithoutmask27(initOffset, in, out); + return; - case 28: ipackwithoutmask28(initOffset, in, out); return; + case 28: + ipackwithoutmask28(initOffset, in, out); + return; - case 29: ipackwithoutmask29(initOffset, in, out); return; + case 29: + ipackwithoutmask29(initOffset, in, out); + return; - case 30: ipackwithoutmask30(initOffset, in, out); return; + case 30: + ipackwithoutmask30(initOffset, in, out); + return; - case 31: ipackwithoutmask31(initOffset, in, out); return; + case 31: + ipackwithoutmask31(initOffset, in, out); + return; - case 32: ipackwithoutmask32(initOffset, in, out); return; + case 32: + ipackwithoutmask32(initOffset, in, out); + return; - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } - - template -void SIMDipack(__m128i initOffset, const uint32_t *in, __m128i *out, const uint32_t bit) { - switch (bit) { - case 0: return; - - case 1: ipack1(initOffset, in, out); return; +void SIMDipack(__m128i initOffset, const uint32_t *in, __m128i *out, + const uint32_t bit) { + switch (bit) { + case 0: + return; - case 2: ipack2(initOffset, in, out); return; + case 1: + ipack1(initOffset, in, out); + return; - case 3: ipack3(initOffset, in, out); return; + case 2: + ipack2(initOffset, in, out); + return; - case 4: ipack4(initOffset, in, out); return; + case 3: + ipack3(initOffset, in, out); + return; - case 5: ipack5(initOffset, in, out); return; + case 4: + ipack4(initOffset, in, out); + return; - case 6: ipack6(initOffset, in, out); return; + case 5: + ipack5(initOffset, in, out); + return; - case 7: ipack7(initOffset, in, out); return; + case 6: + ipack6(initOffset, in, out); + return; - case 8: ipack8(initOffset, in, out); return; + case 7: + ipack7(initOffset, in, out); + return; - case 9: 
ipack9(initOffset, in, out); return; + case 8: + ipack8(initOffset, in, out); + return; - case 10: ipack10(initOffset, in, out); return; + case 9: + ipack9(initOffset, in, out); + return; - case 11: ipack11(initOffset, in, out); return; + case 10: + ipack10(initOffset, in, out); + return; - case 12: ipack12(initOffset, in, out); return; + case 11: + ipack11(initOffset, in, out); + return; - case 13: ipack13(initOffset, in, out); return; + case 12: + ipack12(initOffset, in, out); + return; - case 14: ipack14(initOffset, in, out); return; + case 13: + ipack13(initOffset, in, out); + return; - case 15: ipack15(initOffset, in, out); return; + case 14: + ipack14(initOffset, in, out); + return; - case 16: ipack16(initOffset, in, out); return; + case 15: + ipack15(initOffset, in, out); + return; - case 17: ipack17(initOffset, in, out); return; + case 16: + ipack16(initOffset, in, out); + return; - case 18: ipack18(initOffset, in, out); return; + case 17: + ipack17(initOffset, in, out); + return; - case 19: ipack19(initOffset, in, out); return; + case 18: + ipack18(initOffset, in, out); + return; - case 20: ipack20(initOffset, in, out); return; + case 19: + ipack19(initOffset, in, out); + return; - case 21: ipack21(initOffset, in, out); return; + case 20: + ipack20(initOffset, in, out); + return; - case 22: ipack22(initOffset, in, out); return; + case 21: + ipack21(initOffset, in, out); + return; - case 23: ipack23(initOffset, in, out); return; + case 22: + ipack22(initOffset, in, out); + return; - case 24: ipack24(initOffset, in, out); return; + case 23: + ipack23(initOffset, in, out); + return; - case 25: ipack25(initOffset, in, out); return; + case 24: + ipack24(initOffset, in, out); + return; + + case 25: + ipack25(initOffset, in, out); + return; - case 26: ipack26(initOffset, in, out); return; + case 26: + ipack26(initOffset, in, out); + return; - case 27: ipack27(initOffset, in, out); return; + case 27: + ipack27(initOffset, in, out); + return; - case 28: 
ipack28(initOffset, in, out); return; + case 28: + ipack28(initOffset, in, out); + return; - case 29: ipack29(initOffset, in, out); return; + case 29: + ipack29(initOffset, in, out); + return; - case 30: ipack30(initOffset, in, out); return; + case 30: + ipack30(initOffset, in, out); + return; - case 31: ipack31(initOffset, in, out); return; + case 31: + ipack31(initOffset, in, out); + return; - case 32: ipack32(initOffset, in, out); return; + case 32: + ipack32(initOffset, in, out); + return; - default: break; - } - throw std::logic_error("number of bits is unsupported"); + default: + break; + } + throw std::logic_error("number of bits is unsupported"); } } // namespace SIMDCompressionLib diff --git a/include/simdvariablebyte.h b/include/simdvariablebyte.h index b426a9e..b575c9e 100644 --- a/include/simdvariablebyte.h +++ b/include/simdvariablebyte.h @@ -18,472 +18,474 @@ namespace SIMDCompressionLib { extern "C" { -size_t masked_vbyte_read_loop(const uint8_t* in, uint32_t* out, uint64_t length); -size_t masked_vbyte_read_loop_delta(const uint8_t* in, uint32_t* out, uint64_t length, uint32_t prev); -size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t* in, uint32_t* out, - size_t inputsize); -size_t masked_vbyte_read_loop_fromcompressedsize_delta(const uint8_t* in, uint32_t* out, - size_t inputsize, uint32_t prev); -//size_t read_ints(const uint8_t* in, uint32_t* out, int length) ; -//size_t read_ints_delta(const uint8_t* in, uint32_t* out, int length, uint32_t prev) ; +size_t masked_vbyte_read_loop(const uint8_t *in, uint32_t *out, + uint64_t length); +size_t masked_vbyte_read_loop_delta(const uint8_t *in, uint32_t *out, + uint64_t length, uint32_t prev); +size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, + uint32_t *out, + size_t inputsize); +size_t masked_vbyte_read_loop_fromcompressedsize_delta(const uint8_t *in, + uint32_t *out, + size_t inputsize, + uint32_t prev); +// size_t read_ints(const uint8_t* in, uint32_t* out, int length) 
; +// size_t read_ints_delta(const uint8_t* in, uint32_t* out, int length, uint32_t +// prev) ; uint32_t masked_vbyte_select_delta(const uint8_t *in, uint64_t length, - uint32_t prev, size_t slot); + uint32_t prev, size_t slot); int masked_vbyte_search_delta(const uint8_t *in, uint64_t length, uint32_t prev, - uint32_t key, uint32_t *presult); + uint32_t key, uint32_t *presult); } - /** - * This is a SIMD-accelerated version that is byte-by-byte format compatible with + * This is a SIMD-accelerated version that is byte-by-byte format compatible + * with * the VByte codec (that is, standard vbyte). */ -template -class MaskedVByte: public IntegerCODEC { +template class MaskedVByte : public IntegerCODEC { public: - MaskedVByte() { - } - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - const uint8_t * const initbout = reinterpret_cast (out); - *out = static_cast(length); - out++; - uint8_t * bout = reinterpret_cast (out); - uint32_t prev = 0; - for (size_t k = 0; k < length; ++k) { - - const uint32_t val = delta ? (in[k] - prev) : in[k]; - if(delta) prev = in[k]; - /** - * Code below could be shorter. Whether it could be faster - * depends on your compiler and machine. 
- */ - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 21); - ++bout; - } else { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 28); - ++bout; - } - } - while (needPaddingTo32Bits(bout)) { - *bout++ = 0xff; - } - const size_t storageinbytes = bout - initbout; - nvalue = storageinbytes/4; - } - - const uint32_t * decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t & nvalue) { - nvalue = *in; - const uint8_t *inbyte = decodeFromByteArray((const uint8_t *)in, - length, out, nvalue); - inbyte = padTo32bits(inbyte); - return reinterpret_cast (inbyte); + MaskedVByte() {} + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + const uint8_t *const initbout = reinterpret_cast(out); + *out = static_cast(length); + out++; + uint8_t *bout = reinterpret_cast(out); + uint32_t prev = 0; + for (size_t k = 0; k < length; ++k) { + + const uint32_t val = delta ? (in[k] - prev) : in[k]; + if (delta) + prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. 
+ */ + if (val < (1U << 7)) { + *bout = val & 0x7F; + ++bout; + } else if (val < (1U << 14)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 14); + ++bout; + } else if (val < (1U << 28)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 21); + ++bout; + } else { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 28); + ++bout; + } } - - // Same as above, but operates on byte arrays (uint8_t *) and avoids - // the padding at the end - const uint8_t *decodeFromByteArray(const uint8_t *in, - const size_t /* length */, - uint32_t *out, size_t & nvalue) { - nvalue = *(uint32_t *)in; - in += sizeof(uint32_t); - if (nvalue == 0) { - return in;//abort - } - if(delta) { - uint32_t prev = 0; - in += masked_vbyte_read_loop_delta(in, out, nvalue, prev); - } else - in += masked_vbyte_read_loop(in, out, nvalue); - return in; + while (needPaddingTo32Bits(bout)) { + *bout++ = 0xff; } - - // append a key. Keys must be in sorted order. We assume that there is - // enough room and that delta encoding was used. 
- // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, size_t bytesize, uint32_t previous_key, - uint32_t key) { - uint32_t num_ints = *(uint32_t *)in; - if (bytesize == 0) - bytesize = 4; - uint8_t *bytein = in + bytesize; - bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); - *(uint32_t *)in = num_ints + 1; - return bytein - in; + const size_t storageinbytes = bout - initbout; + nvalue = storageinbytes / 4; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + nvalue = *in; + const uint8_t *inbyte = + decodeFromByteArray((const uint8_t *)in, length, out, nvalue); + inbyte = padTo32bits(inbyte); + return reinterpret_cast(inbyte); + } + + // Same as above, but operates on byte arrays (uint8_t *) and avoids + // the padding at the end + const uint8_t *decodeFromByteArray(const uint8_t *in, + const size_t /* length */, uint32_t *out, + size_t &nvalue) { + nvalue = *(uint32_t *)in; + in += sizeof(uint32_t); + if (nvalue == 0) { + return in; // abort } - - // Returns a decompressed value in a delta-encoded array - // only supported for delta encoded data (TODO) - uint32_t select(uint32_t *in, size_t index) { - assert(delta == true); - uint32_t num_ints = *in; - in++; - return (masked_vbyte_select_delta((uint8_t *)in, num_ints, 0, index)); + if (delta) { + uint32_t prev = 0; + in += masked_vbyte_read_loop_delta(in, out, nvalue, prev); + } else + in += masked_vbyte_read_loop(in, out, nvalue); + return in; + } + + // append a key. Keys must be in sorted order. We assume that there is + // enough room and that delta encoding was used. 
+ // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, size_t bytesize, uint32_t previous_key, + uint32_t key) { + uint32_t num_ints = *(uint32_t *)in; + if (bytesize == 0) + bytesize = 4; + uint8_t *bytein = in + bytesize; + bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); + *(uint32_t *)in = num_ints + 1; + return bytein - in; + } + + // Returns a decompressed value in a delta-encoded array + // only supported for delta encoded data (TODO) + uint32_t select(uint32_t *in, size_t index) { + assert(delta == true); + uint32_t num_ints = *in; + in++; + return (masked_vbyte_select_delta((uint8_t *)in, num_ints, 0, index)); + } + + // Performs a lower bound find in the delta-encoded array. + // Returns the index + // length is the size of the compressed input + // only supported for delta encoded data (TODO) + size_t findLowerBound(const uint32_t *in, const size_t /*length*/, + uint32_t key, uint32_t *presult) { + assert(delta == true); + uint32_t num_ints = *in; + in++; + return ( + masked_vbyte_search_delta((uint8_t *)in, num_ints, 0, key, presult)); + } + // insert the key in sorted order. We assume that there is enough room + // and that delta encoding was used. + size_t insert(uint32_t *in, const size_t length, uint32_t key) { + assert(delta); + size_t bytesize = length * 4; + bytesize -= paddingBytes(in, length); + uint8_t *bytein = (uint8_t *)in; + uint8_t *byteininit = bytein; + bytein += insert(bytein, bytesize, key); + + while (needPaddingTo32Bits(bytein)) { + *bytein++ = 0xFF; } - - // Performs a lower bound find in the delta-encoded array. 
- // Returns the index - // length is the size of the compressed input - // only supported for delta encoded data (TODO) - size_t findLowerBound(const uint32_t *in, const size_t /*length*/, - uint32_t key, uint32_t *presult) { - assert(delta == true); - uint32_t num_ints = *in; - in++; - return (masked_vbyte_search_delta((uint8_t *)in, num_ints, - 0, key, presult)); - } - // insert the key in sorted order. We assume that there is enough room - // and that delta encoding was used. - size_t insert(uint32_t *in, const size_t length, uint32_t key) { - assert(delta); - size_t bytesize = length * 4; - bytesize -= paddingBytes(in,length); - uint8_t * bytein = (uint8_t *) in; - uint8_t * byteininit = bytein; - bytein += insert(bytein, bytesize, key); - - while (needPaddingTo32Bits(bytein)) { - *bytein++ = 0xFF; + size_t storageinbytes = bytein - byteininit; + assert((storageinbytes % 4) == 0); + return storageinbytes / 4; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. + // the new size (in *byte) is returned + size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { + uint32_t prev = 0; + assert(delta); + const uint8_t *const endbyte = + reinterpret_cast(inbyte + length); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); } - size_t storageinbytes = bytein - byteininit; - assert((storageinbytes % 4) == 0); - return storageinbytes / 4; - } - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. 
- // the new size (in *byte) is returned - size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { - uint32_t prev = 0; - assert(delta); - const uint8_t * const endbyte = reinterpret_cast(inbyte - + length); - // this assumes that there is a value to be read - - while (endbyte > inbyte + 5) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c < 128)) { - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, - endbyte - inbyte); - - } - break; - } - } - } - // if we make it here, then we need to append - assert(key >= prev); - return length + encodeOneIntegerToByteArray(key - prev, inbyte); - } - - std::string name() const { - if (delta) - return "MaskedVByteDelta"; - else - return "MaskedVByte"; - } - -private: - - - // convenience function used by 
insert, writes key and newvalue to compressed stream, and return - // extra storage used, pointer should be right after where nextvalue is right now - size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, uint32_t nextvalue, size_t followingbytes) { - assert(nextvalue >= key); - assert(key >= previous); - size_t oldstorage = storageCost(nextvalue - previous); - size_t newstorage = storageCost(nextvalue - key) + storageCost(key - previous); - assert(newstorage >= oldstorage); - if(newstorage > oldstorage) std::memmove(in + newstorage - oldstorage, in,followingbytes); - uint8_t *newin = in - oldstorage; - newin += encodeOneIntegerToByteArray(key - previous, newin); - newin += encodeOneIntegerToByteArray(nextvalue - key, newin); - assert(newin == in + newstorage - oldstorage); - return newstorage - oldstorage; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + if (prev >= key) { + return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } } - - - // how many bytes are required to store this integer? 
- int storageCost(uint32_t val) { - if (val < (1U << 7)) { - return 1; - } else if (val < (1U << 14)) { - return 2; - } else if (val < (1U << 21)) { - return 3; - } else if (val < (1U << 28)) { - return 4; - } else { - return 5; + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c < 128)) { + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + break; } + } } + // if we make it here, then we need to append + assert(key >= prev); + return length + encodeOneIntegerToByteArray(key - prev, inbyte); + } + + std::string name() const { + if (delta) + return "MaskedVByteDelta"; + else + return "MaskedVByte"; + } - template - uint8_t extract7bits(const uint32_t val) { - return static_cast((val >>(7 * i)) & ((1U << 7) - 1)); +private: + // convenience function used by insert, writes key and newvalue to compressed + // stream, and return + // extra storage used, pointer should be right after where nextvalue is right + // now + size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, + uint32_t nextvalue, size_t followingbytes) { + assert(nextvalue >= key); + assert(key >= previous); + size_t oldstorage = storageCost(nextvalue - previous); + size_t newstorage = + storageCost(nextvalue - key) + storageCost(key - previous); + assert(newstorage >= oldstorage); + if (newstorage > oldstorage) + std::memmove(in + newstorage - oldstorage, in, followingbytes); + uint8_t *newin = in - oldstorage; + newin += encodeOneIntegerToByteArray(key - previous, newin); + newin += encodeOneIntegerToByteArray(nextvalue - key, newin); + assert(newin == in + newstorage - oldstorage); + return newstorage - oldstorage; + } + + // how many bytes are required to store this integer? 
+ int storageCost(uint32_t val) { + if (val < (1U << 7)) { + return 1; + } else if (val < (1U << 14)) { + return 2; + } else if (val < (1U << 21)) { + return 3; + } else if (val < (1U << 28)) { + return 4; + } else { + return 5; } - - template - uint8_t extract7bitsmaskless(const uint32_t val) { - return static_cast((val >>(7 * i))); + } + + template uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } + + template uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> (7 * i))); + } + + // determine how many padding bytes were used + int paddingBytes(const uint32_t *in, const size_t length) { + if (length == 0) + return 0; + uint32_t lastword = in[length - 1]; + if (lastword < (1U << 8)) { + return 3; + } else if (lastword < (1U << 16)) { + return 2; + } else if (lastword < (1U << 24)) { + return 1; } - - // determine how many padding bytes were used - int paddingBytes(const uint32_t *in, const size_t length) { - if(length == 0) return 0; - uint32_t lastword = in[length - 1]; - if (lastword < (1U << 8)) { - return 3; - } else if (lastword < (1U << 16)) { - return 2; - } else if (lastword < (1U << 24)) { - return 1; - } - return 0; + return 0; + } + + // write one compressed integer (without differential coding) + // returns the number of bytes written + size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { + const uint8_t *const initbout = bout; + if (val < (1U << 7)) { + *bout = val & 0x7F; + ++bout; + } else if (val < (1U << 14)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 14); + ++bout; + } else if (val < (1U << 28)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); 
+ ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 21); + ++bout; + } else { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 28); + ++bout; } - - // write one compressed integer (without differential coding) - // returns the number of bytes written - size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { - const uint8_t * const initbout = bout; - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 21); - ++bout; - } else { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 28); - ++bout; - } - return bout - initbout; - } + return bout - initbout; + } }; - // this version differs from MaskedVByte in that it does not write out the // number of integers compressed as part of a header. 
-template -class HeadlessMaskedVByte: public IntegerCODEC { +template class HeadlessMaskedVByte : public IntegerCODEC { public: - HeadlessMaskedVByte() { + HeadlessMaskedVByte() {} + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + const uint8_t *const initbout = reinterpret_cast(out); + uint8_t *bout = reinterpret_cast(out); + uint32_t prev = 0; + for (size_t k = 0; k < length; ++k) { + const uint32_t val = delta ? (in[k] - prev) : in[k]; + if (delta) + prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. + */ + if (val < (1U << 7)) { + *bout = val & 0x7F; + ++bout; + } else if (val < (1U << 14)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 14); + ++bout; + } else if (val < (1U << 28)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 21); + ++bout; + } else { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 28); + ++bout; + } } - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - const uint8_t * const initbout = reinterpret_cast (out); - uint8_t * bout = reinterpret_cast (out); - uint32_t prev = 0; - for (size_t k = 0; k < length; ++k) { - const uint32_t val = delta ? (in[k] - prev) : in[k]; - if(delta) prev = in[k]; - /** - * Code below could be shorter. 
Whether it could be faster - * depends on your compiler and machine. - */ - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 21); - ++bout; - } else { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 28); - ++bout; - } - } - while (needPaddingTo32Bits(bout)) { - *bout++ = 0xFFU;; - } - const size_t storageinbytes = bout - initbout; - nvalue = storageinbytes/4; + while (needPaddingTo32Bits(bout)) { + *bout++ = 0xFFU; + ; } - - const uint32_t * decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t & nvalue) { - if (length == 0) { - nvalue = 0; - return in;//abort - } - const uint8_t * inbyte = reinterpret_cast (in); - if(delta) { - uint32_t prev = 0; - nvalue = masked_vbyte_read_loop_fromcompressedsize_delta(inbyte, out, length * 4, prev); - } else { - nvalue = masked_vbyte_read_loop_fromcompressedsize(inbyte, out, length * 4); - } - - return in + length; + const size_t storageinbytes = bout - initbout; + nvalue = storageinbytes / 4; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + if (length == 0) { + nvalue = 0; + return in; // abort + } + const uint8_t 
*inbyte = reinterpret_cast(in); + if (delta) { + uint32_t prev = 0; + nvalue = masked_vbyte_read_loop_fromcompressedsize_delta( + inbyte, out, length * 4, prev); + } else { + nvalue = + masked_vbyte_read_loop_fromcompressedsize(inbyte, out, length * 4); } + return in + length; + } - - std::string name() const { - if (delta) - return "HeadlessMaskedVByteDelta"; - else - return "HeadlessMaskedVByte"; - } + std::string name() const { + if (delta) + return "HeadlessMaskedVByteDelta"; + else + return "HeadlessMaskedVByte"; + } private: - template - uint8_t extract7bits(const uint32_t val) { - return static_cast((val >>(7 * i)) & ((1U << 7) - 1)); - } + template uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } - template - uint8_t extract7bitsmaskless(const uint32_t val) { - return static_cast((val >>(7 * i))); - } + template uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> (7 * i))); + } }; } // namespace SIMDCompressionLib diff --git a/include/skipping.h b/include/skipping.h index efdffd0..d46e643 100644 --- a/include/skipping.h +++ b/include/skipping.h @@ -1,13 +1,16 @@ /* - * This is a simple implementation of the Skipping data structure and algorithms similar to + * This is a simple implementation of the Skipping data structure and algorithms + * similar to * what is described in * * Sanders and Transier, Intersection in Integer Inverted Indices, 2007. * - * As suggested in their conclusion, we leave the higher-level structure uncompressed. We also + * As suggested in their conclusion, we leave the higher-level structure + * uncompressed. We also * use differential coding. * - * Sanders and Transier's proposal is similar in spirit to the skipping structure proposed in + * Sanders and Transier's proposal is similar in spirit to the skipping + * structure proposed in * * Moffat, A., Zobel, J.: Self-indexing inverted files for fast text retrieval. 
* ACM Transactions on Information Systems 14 (1996). @@ -25,252 +28,255 @@ namespace SIMDCompressionLib { class Skipping { public: + Skipping(uint32_t BS, const uint32_t *data, uint32_t length) + : BlockSizeLog(BS), mainbuffer(), highbuffer(), Length(0) { + if ((BlockSizeLog == 0) && (BlockSizeLog >= 32)) + throw runtime_error("please use a reasonable BlockSizeLog"); + load(data, length); // cheap constructor + } + ~Skipping() {} - Skipping(uint32_t BS, const uint32_t *data, uint32_t length) : - BlockSizeLog(BS), - mainbuffer(), highbuffer(), Length(0) { - if ((BlockSizeLog == 0) && (BlockSizeLog >= 32)) throw runtime_error("please use a reasonable BlockSizeLog"); - load(data, length);// cheap constructor - } - + size_t storageInBytes() const { + return mainbuffer.size() * sizeof(uint8_t) + + highbuffer.size() * sizeof(higharraypair) + + sizeof(Length); // rough estimates (good enough) + } - ~Skipping() {} + uint32_t decompress(uint32_t *out) const { + const uint8_t *bout = mainbuffer.data(); + uint32_t pos = 0; - size_t storageInBytes() const { - return mainbuffer.size() * sizeof(uint8_t) - + highbuffer.size() * sizeof(higharraypair) - + sizeof(Length); // rough estimates (good enough) + uint32_t val = 0; + for (uint32_t k = 0; k < Length; ++k) { + bout = decode(bout, val); + out[pos++] = val; } + return pos; + } - uint32_t decompress(uint32_t *out) const { - const uint8_t *bout = mainbuffer.data(); - uint32_t pos = 0; - - uint32_t val = 0; - for (uint32_t k = 0; k < Length; ++k) { - bout = decode(bout, val); - out[pos++] = val; - } - return pos; - } - - - /** - * Intersects the current Skipping structure with a (small) uncompressed array and - * writes the answer to out. 
- */ - uint32_t intersect(const uint32_t *smallarray, uint32_t length, uint32_t *out) const { - uint32_t intersectsize = 0; - const uint8_t *largemainpointer = mainbuffer.data(); - uint32_t largemainval = 0; + /** + * Intersects the current Skipping structure with a (small) uncompressed array + * and + * writes the answer to out. + */ + uint32_t intersect(const uint32_t *smallarray, uint32_t length, + uint32_t *out) const { + uint32_t intersectsize = 0; + const uint8_t *largemainpointer = mainbuffer.data(); + uint32_t largemainval = 0; + largemainpointer = decode(largemainpointer, largemainval); + uint32_t x = 0; + for (uint32_t k = 0; k < length; ++k) { + uint32_t val = smallarray[k]; + // if the last value of the current block is too small, skip the block + // entirely + if (highbuffer[x >> BlockSizeLog].first < val) { + do { + x = ((x >> BlockSizeLog) + 1) << BlockSizeLog; + if (x >= Length) { + return intersectsize; + } + } while (highbuffer[x >> BlockSizeLog].first < val); + largemainpointer = + mainbuffer.data() + highbuffer[x >> BlockSizeLog].second; + largemainval = highbuffer[(x >> BlockSizeLog) - 1].first; largemainpointer = decode(largemainpointer, largemainval); - uint32_t x = 0; - for (uint32_t k = 0; k < length; ++k) { - uint32_t val = smallarray[k]; - // if the last value of the current block is too small, skip the block entirely - if (highbuffer[x >> BlockSizeLog].first < val) { - do { - x = ((x >> BlockSizeLog) + 1) << BlockSizeLog; - if (x >= Length) { - return intersectsize; - } - } while (highbuffer[x >> BlockSizeLog].first < val); - largemainpointer = mainbuffer.data() - + highbuffer[x >> BlockSizeLog].second; - largemainval = highbuffer[(x >> BlockSizeLog) - 1].first; - largemainpointer = decode(largemainpointer, largemainval); - } - // at this point, we have that the last value of the current block is >= val - // this means that we shall decode at most one block before giving up - while (largemainval < val) { - ++x; - if (x >= Length) { - 
return intersectsize; - } - largemainpointer = decode(largemainpointer, largemainval); - } - if (largemainval == val) { - out[intersectsize++] = val; - } + } + // at this point, we have that the last value of the current block is >= + // val + // this means that we shall decode at most one block before giving up + while (largemainval < val) { + ++x; + if (x >= Length) { + return intersectsize; } - return intersectsize; - + largemainpointer = decode(largemainpointer, largemainval); + } + if (largemainval == val) { + out[intersectsize++] = val; + } } - uint32_t intersect(const Skipping &otherlarger, uint32_t *out) const { - // we assume that "this" is the smallest of the two - if (otherlarger.Length < Length) - return otherlarger.intersect(*this, out); - if (Length == 0) - return 0;// special silly case - assert(otherlarger.Length >= Length); - assert(otherlarger.Length > 0); - uint32_t intersectsize = 0; + return intersectsize; + } + uint32_t intersect(const Skipping &otherlarger, uint32_t *out) const { + // we assume that "this" is the smallest of the two + if (otherlarger.Length < Length) + return otherlarger.intersect(*this, out); + if (Length == 0) + return 0; // special silly case + assert(otherlarger.Length >= Length); + assert(otherlarger.Length > 0); + uint32_t intersectsize = 0; - const uint8_t *inbyte = mainbuffer.data(); - const uint8_t *const endbyte = mainbuffer.data() - + mainbuffer.size(); - const uint8_t *largemainpointer = otherlarger.mainbuffer.data(); - uint32_t largemainval = 0; + const uint8_t *inbyte = mainbuffer.data(); + const uint8_t *const endbyte = mainbuffer.data() + mainbuffer.size(); + const uint8_t *largemainpointer = otherlarger.mainbuffer.data(); + uint32_t largemainval = 0; + largemainpointer = decode(largemainpointer, largemainval); + uint32_t val = 0; // where I put decoded values + uint32_t x = 0; + while (endbyte > inbyte) { + inbyte = decode(inbyte, val); + // if the last value of the current block is too small, skip the block 
+ // entirely + if (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val) { + do { + x = ((x >> otherlarger.BlockSizeLog) + 1) << otherlarger.BlockSizeLog; + if (x >= otherlarger.Length) { + return intersectsize; + } + } while (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < + val); + largemainpointer = + otherlarger.mainbuffer.data() + + otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].second; + largemainval = + otherlarger.highbuffer[(x >> otherlarger.BlockSizeLog) - 1].first; largemainpointer = decode(largemainpointer, largemainval); - uint32_t val = 0;// where I put decoded values - uint32_t x = 0; - while (endbyte > inbyte) { - inbyte = decode(inbyte, val); - // if the last value of the current block is too small, skip the block entirely - if (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val) { - do { - x = ((x >> otherlarger.BlockSizeLog) + 1) << otherlarger.BlockSizeLog; - if (x >= otherlarger.Length) { - return intersectsize; - } - } while (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val); - largemainpointer = otherlarger.mainbuffer.data() - + otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].second; - largemainval = otherlarger.highbuffer[(x >> otherlarger.BlockSizeLog) - 1].first; - largemainpointer = decode(largemainpointer, largemainval); - } - // at this point, we have that the last value of the current block is >= val - // this means that we shall decode at most one block before giving up - while (largemainval < val) { - ++x; - if (x >= otherlarger.Length) { - return intersectsize; - } - largemainpointer = decode(largemainpointer, largemainval); - } - if (largemainval == val) { - out[intersectsize++] = val; - } + } + // at this point, we have that the last value of the current block is >= + // val + // this means that we shall decode at most one block before giving up + while (largemainval < val) { + ++x; + if (x >= otherlarger.Length) { + return intersectsize; } - return 
intersectsize; + largemainpointer = decode(largemainpointer, largemainval); + } + if (largemainval == val) { + out[intersectsize++] = val; + } } + return intersectsize; + } + + uint32_t BlockSizeLog; + vector mainbuffer; + typedef pair higharraypair; - uint32_t BlockSizeLog; - vector mainbuffer; - typedef pair higharraypair; + typedef vector higharray; + higharray highbuffer; + uint32_t Length; - typedef vector higharray; - higharray highbuffer; - uint32_t Length; + // please don't use the default constructor... - // please don't use the default constructor... + Skipping() : BlockSizeLog(0), mainbuffer(), highbuffer(), Length(0) {} - Skipping() : BlockSizeLog(0), - mainbuffer(), highbuffer(), Length(0) {} private: + Skipping(const Skipping &); - Skipping(const Skipping &); + // making it private on purpose + Skipping &operator=(const Skipping &); - // making it private on purpose - Skipping &operator=(const Skipping &); + void load(const uint32_t *data, uint32_t length); - void load(const uint32_t *data, uint32_t length); + template uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } - template - uint8_t extract7bits(const uint32_t val) { - return static_cast((val >>(7 * i)) & ((1U << 7) - 1)); + template uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> (7 * i))); + } + static inline const uint8_t *decode(const uint8_t *buffer, uint32_t &prev) { + // manually unrolled for performance + uint32_t v = 0; + uint8_t c = *buffer++; + v += (c & 127); + if ((c & 128)) { + prev += v; + return buffer; } - - template - uint8_t extract7bitsmaskless(const uint32_t val) { - return static_cast((val >>(7 * i))); + c = *buffer++; + v += ((c & 127) << 7); + if ((c & 128)) { + prev += v; + return buffer; } - static inline const uint8_t *decode(const uint8_t *buffer, uint32_t &prev) { - // manually unrolled for performance - uint32_t v = 0; - uint8_t c = *buffer++; - v += (c & 127) ; - if ((c & 128)) { 
- prev += v; - return buffer; - } - c = *buffer++; - v += ((c & 127) << 7); - if ((c & 128)) { - prev += v; - return buffer; - } - c = *buffer++; - v += ((c & 127) << 14); - if ((c & 128)) { - prev += v; - return buffer; - } - c = *buffer++; - v += ((c & 127) << 21); - if ((c & 128)) { - prev += v; - return buffer; - } - c = *buffer++; - v += ((c & 127) << 30); - prev += v; - return buffer; + c = *buffer++; + v += ((c & 127) << 14); + if ((c & 128)) { + prev += v; + return buffer; } + c = *buffer++; + v += ((c & 127) << 21); + if ((c & 128)) { + prev += v; + return buffer; + } + c = *buffer++; + v += ((c & 127) << 30); + prev += v; + return buffer; + } }; -void Skipping::load(const uint32_t *data, uint32_t len) { - assert(numeric_limits::max() < (numeric_limits::max() / 5));// check for overflow - Length = len; - if (Length == 0) return; // nothing to do - uint32_t BlockNumber = (Length + (1 << BlockSizeLog) - 1) / (1 << BlockSizeLog); // count full blocks - assert(BlockNumber << BlockSizeLog >= Length); - highbuffer.resize(BlockNumber); - mainbuffer.resize(5 * Length); - uint8_t *bout = mainbuffer.data(); - uint8_t *const boutinit = bout; - uint32_t prev = 0; - for (uint32_t k = 0; k < BlockNumber; ++k) { - const uint32_t howmany = (((k + 1) << BlockSizeLog) > Length) ? - Length - (k << BlockSizeLog) +void Skipping::load(const uint32_t *data, uint32_t len) { + assert(numeric_limits::max() < + (numeric_limits::max() / 5)); // check for overflow + Length = len; + if (Length == 0) + return; // nothing to do + uint32_t BlockNumber = (Length + (1 << BlockSizeLog) - 1) / + (1 << BlockSizeLog); // count full blocks + assert(BlockNumber << BlockSizeLog >= Length); + highbuffer.resize(BlockNumber); + mainbuffer.resize(5 * Length); + uint8_t *bout = mainbuffer.data(); + uint8_t *const boutinit = bout; + uint32_t prev = 0; + for (uint32_t k = 0; k < BlockNumber; ++k) { + const uint32_t howmany = (((k + 1) << BlockSizeLog) > Length) + ? 
Length - (k << BlockSizeLog) : 1 << BlockSizeLog; - highbuffer[k] = make_pair(data[(k << BlockSizeLog) + howmany - 1], - static_cast(bout - boutinit)); - for (uint32_t x = 0; x < howmany; ++x) { - const uint32_t v = data[x + (k << BlockSizeLog)]; - const uint32_t val = v - prev; - prev = v; - if (val < (1U << 7)) { - *bout = static_cast(val | (1U << 7)); - ++bout; - } else if (val < (1U << 14)) { - *bout = extract7bits<0> (val); - ++bout; - *bout = extract7bitsmaskless<1> (val) | (1U << 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = extract7bits<0> (val); - ++bout; - *bout = extract7bits<1> (val); - ++bout; - *bout = extract7bitsmaskless<2> (val) | (1U << 7); - ++bout; - } else if (val < (1U << 28)) { - *bout = extract7bits<0> (val); - ++bout; - *bout = extract7bits<1> (val); - ++bout; - *bout = extract7bits<2> (val); - ++bout; - *bout = extract7bitsmaskless<3> (val) | (1U << 7); - ++bout; - } else { - *bout = extract7bits<0> (val); - ++bout; - *bout = extract7bits<1> (val); - ++bout; - *bout = extract7bits<2> (val); - ++bout; - *bout = extract7bits<3> (val); - ++bout; - *bout = extract7bitsmaskless<4> (val) | (1U << 7); - ++bout; - } - } + highbuffer[k] = make_pair(data[(k << BlockSizeLog) + howmany - 1], + static_cast(bout - boutinit)); + for (uint32_t x = 0; x < howmany; ++x) { + const uint32_t v = data[x + (k << BlockSizeLog)]; + const uint32_t val = v - prev; + prev = v; + if (val < (1U << 7)) { + *bout = static_cast(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bitsmaskless<1>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bitsmaskless<2>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bitsmaskless<3>(val) | 
(1U << 7); + ++bout; + } else { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bits<3>(val); + ++bout; + *bout = extract7bitsmaskless<4>(val) | (1U << 7); + ++bout; + } } - mainbuffer.resize(static_cast(bout - boutinit)); - mainbuffer.shrink_to_fit(); + } + mainbuffer.resize(static_cast(bout - boutinit)); + mainbuffer.shrink_to_fit(); } } // namespace SIMDCompressionLib diff --git a/include/sortedbitpacking.h b/include/sortedbitpacking.h index a0f44aa..2d00e8b 100644 --- a/include/sortedbitpacking.h +++ b/include/sortedbitpacking.h @@ -8,19 +8,15 @@ #ifndef SIMDCompressionAndIntersection_SORTEDBITPACKING_H_ #define SIMDCompressionAndIntersection_SORTEDBITPACKING_H_ - #include "common.h" #include "simdbitpacking.h" #include "bitpackinghelpers.h" namespace SIMDCompressionLib { - -template -CONST_FUNCTION -static T *padTo128bits(T *inbyte) { - return reinterpret_cast((reinterpret_cast(inbyte) - + 15) & ~15); +template CONST_FUNCTION static T *padTo128bits(T *inbyte) { + return reinterpret_cast((reinterpret_cast(inbyte) + 15) & + ~15); } /** @@ -32,191 +28,173 @@ static T *padTo128bits(T *inbyte) { */ class BasicSortedBitPacker { public: + enum { DEFAULTSIZE = 128 }; // should be a multiple of 128 + uint32_t buffer[32]; - enum {DEFAULTSIZE = 128}; // should be a multiple of 128 - uint32_t buffer[32]; - - static string name() { - return "uBSBP"; - } + static string name() { return "uBSBP"; } - BasicSortedBitPacker() { - for (uint32_t i = 0; i < 32; ++i) { - data[i] = new uint32_t[DEFAULTSIZE]; - memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); - actualsizes[i] = DEFAULTSIZE; - } - clear(); + BasicSortedBitPacker() { + for (uint32_t i = 0; i < 32; ++i) { + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; } - - void reset() { - for (uint32_t i = 0; i < 32; ++i) { - delete[] data[i]; - data[i] = new 
uint32_t[DEFAULTSIZE]; - memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); - actualsizes[i] = DEFAULTSIZE; - } - clear(); + clear(); + } + + void reset() { + for (uint32_t i = 0; i < 32; ++i) { + delete[] data[i]; + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i], 0, DEFAULTSIZE * sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; } - - ~BasicSortedBitPacker() { - free(); + clear(); + } + + ~BasicSortedBitPacker() { free(); } + void free() { + clear(); + for (uint32_t i = 0; i < 32; ++i) + if (data[i] != NULL) { + delete[] data[i]; + data[i] = NULL; + actualsizes[i] = 0; + } + } + void directAppend(uint32_t i, uint32_t val) { data[i][sizes[i]++] = val; } + + const uint32_t *get(int i) { return data[i]; } + + void ensureCapacity(int i, uint32_t datatoadd) { + if (sizes[i] + datatoadd > actualsizes[i]) { + actualsizes[i] = (sizes[i] + datatoadd + 127) / 128 * 128 * + 2; // so we always get a multiple of 128 + uint32_t *tmp = new uint32_t[actualsizes[i]]; + for (uint32_t j = 0; j < sizes[i]; ++j) + tmp[j] = data[i][j]; + delete[] data[i]; + data[i] = tmp; } - void free() { - clear(); - for (uint32_t i = 0; i < 32; ++i) - if (data[i] != NULL) { - delete[] data[i]; - data[i] = NULL; - actualsizes[i] = 0; - } + } + + void clear() { + for (uint32_t i = 0; i < 32; ++i) + sizes[i] = 0; // memset "might" be faster. 
+ } + + uint32_t *write(uint32_t *out) { + uint32_t bitmap = 0; + for (uint32_t k = 1; k < 32; ++k) { + if (sizes[k] != 0) + bitmap |= (1U << k); } - void directAppend(uint32_t i, uint32_t val) { - data[i][sizes[i]++] = val; - } - - const uint32_t *get(int i) { - return data[i]; - } - - void ensureCapacity(int i, uint32_t datatoadd) { - if (sizes[i] + datatoadd > actualsizes[i]) { - actualsizes[i] = (sizes[i] + datatoadd + 127) / 128 * 128 * 2; // so we always get a multiple of 128 - uint32_t *tmp = new uint32_t[actualsizes[i]]; - for (uint32_t j = 0; j < sizes[i]; ++j) - tmp[j] = data[i][j]; - delete[] data[i]; - data[i] = tmp; + *(out++) = bitmap; + + for (uint32_t k = 1; k < 32; ++k) { + if (sizes[k] != 0) { + *out = sizes[k]; + out++; + uint32_t j = 0; + for (; j + 128 <= sizes[k]; j += 128) { + usimdpackwithoutmask(&data[k][j], reinterpret_cast<__m128i *>(out), + k + 1); + out += 4 * (k + 1); } + // falling back on scalar + for (; j < sizes[k]; j += 32) { + BitPackingHelpers::fastpackwithoutmask(&data[k][j], out, k + 1); + out += k + 1; + } + out -= (j - sizes[k]) * (k + 1) / 32; + } } - - void clear() { - for (uint32_t i = 0; i < 32; ++i) - sizes[i] = 0;// memset "might" be faster. 
- } - - - uint32_t *write(uint32_t *out) { - uint32_t bitmap = 0; - for (uint32_t k = 1; k < 32; ++k) { - if (sizes[k] != 0) - bitmap |= (1U << k); + return out; + } + const uint32_t *read(const uint32_t *in) { + clear(); + const uint32_t bitmap = *(in++); + + for (uint32_t k = 1; k < 32; ++k) { + if ((bitmap & (1U << k)) != 0) { + sizes[k] = *in++; + if (actualsizes[k] < sizes[k]) { + delete[] data[k]; + actualsizes[k] = (sizes[k] + 127) / 128 * 128; + data[k] = new uint32_t[actualsizes[k]]; } - *(out++) = bitmap; - - for (uint32_t k = 1; k < 32; ++k) { - if (sizes[k] != 0) { - *out = sizes[k]; - out++; - uint32_t j = 0; - for (; j + 128 <= sizes[k]; j += 128) { - usimdpackwithoutmask(&data[k][j], reinterpret_cast<__m128i *>(out), k + 1); - out += 4 * (k + 1); - } - // falling back on scalar - for (; j < sizes[k]; j += 32) { - BitPackingHelpers::fastpackwithoutmask(&data[k][j], out, k + 1); - out += k + 1; - } - out -= ( j - sizes[k] ) * (k + 1) / 32; - + uint32_t j = 0; + for (; j + 128 <= sizes[k]; j += 128) { - - - } + usimdunpack(reinterpret_cast(in), &data[k][j], + k + 1); + in += 4 * (k + 1); } - return out; - } - const uint32_t *read(const uint32_t *in) { - clear(); - const uint32_t bitmap = *(in++); - - for (uint32_t k = 1; k < 32; ++k) { - if ((bitmap & (1U << k)) != 0) { - sizes[k] = *in++; - if (actualsizes[k] < sizes[k]) { - delete[] data[k]; - actualsizes[k] = (sizes[k] + 127) / 128 * 128; - data[k] = new uint32_t[actualsizes[k]]; - } - uint32_t j = 0; - for (; j + 128 <= sizes[k] ; j += 128) { - - usimdunpack(reinterpret_cast(in), &data[k][j], k + 1); - in += 4 * (k + 1); - - } - for (; j + 31 < sizes[k]; j += 32) { - BitPackingHelpers::fastunpack(in, &data[k][j], k + 1); - in += k + 1; - } - uint32_t remaining = sizes[k] - j; - memcpy(buffer,in,(remaining * (k + 1) + 31)/32*sizeof(uint32_t)); - uint32_t * bpointer = buffer; - in += ((sizes[k]+31)/32*32-j)/32 * (k+1); - for(; j < sizes[k]; j += 32) { - BitPackingHelpers::fastunpack(bpointer, 
&data[k][j], k + 1); - bpointer += k + 1; - } - in -= ( j - sizes[k] ) * (k + 1) / 32; - - } + for (; j + 31 < sizes[k]; j += 32) { + BitPackingHelpers::fastunpack(in, &data[k][j], k + 1); + in += k + 1; } - return in; - + uint32_t remaining = sizes[k] - j; + memcpy(buffer, in, (remaining * (k + 1) + 31) / 32 * sizeof(uint32_t)); + uint32_t *bpointer = buffer; + in += ((sizes[k] + 31) / 32 * 32 - j) / 32 * (k + 1); + for (; j < sizes[k]; j += 32) { + BitPackingHelpers::fastunpack(bpointer, &data[k][j], k + 1); + bpointer += k + 1; + } + in -= (j - sizes[k]) * (k + 1) / 32; + } } + return in; + } + + // for debugging + void sanityCheck() { + for (uint32_t k = 0; k < 32; ++k) { + if (sizes[k] > actualsizes[k]) { + cerr << "overflow at " << k << endl; + throw runtime_error("bug"); + } + if (sizes[k] != 0) { + cout << "k=" << k << endl; + uint32_t mask = 0u; + for (uint32_t j = 0; j < sizes[k]; ++j) { + cout << data[k][j] << " "; + mask |= data[k][j]; + } + cout << endl; - // for debugging - void sanityCheck() { - for (uint32_t k = 0; k < 32; ++k) { - if (sizes[k] > actualsizes[k]) { - cerr << "overflow at " << k << endl; - throw runtime_error("bug"); - } - if (sizes[k] != 0) { - cout << "k=" << k << endl; - uint32_t mask = 0u; - for (uint32_t j = 0; j < sizes[k]; ++j) { - cout << data[k][j] << " "; - mask |= data[k][j]; - } - cout << endl; - - if (gccbits(mask) > k + 1) { - cerr << "At " << (k + 1) << " we have " << gccbits(mask) << endl; - throw runtime_error("bug"); - } - } + if (gccbits(mask) > k + 1) { + cerr << "At " << (k + 1) << " we have " << gccbits(mask) << endl; + throw runtime_error("bug"); } + } } - - bool equals(const BasicSortedBitPacker &o) { - for (uint32_t k = 0; k < 32; ++k) { - if (sizes[k] != o.sizes[k]) { - return false; - } - for (uint32_t j = 0; j < sizes[k]; ++j) - if (data[k][j] != o.data[k][j]) { - return false; - } + } + + bool equals(const BasicSortedBitPacker &o) { + for (uint32_t k = 0; k < 32; ++k) { + if (sizes[k] != o.sizes[k]) { + 
return false; + } + for (uint32_t j = 0; j < sizes[k]; ++j) + if (data[k][j] != o.data[k][j]) { + return false; } - return true; } - + return true; + } private: - uint32_t *data[32]; - uint32_t sizes[32]; - uint32_t actualsizes[32]; - - // we don't want anyone to start copying this class - BasicSortedBitPacker(const BasicSortedBitPacker &); - BasicSortedBitPacker &operator=(const BasicSortedBitPacker &); - + uint32_t *data[32]; + uint32_t sizes[32]; + uint32_t actualsizes[32]; + // we don't want anyone to start copying this class + BasicSortedBitPacker(const BasicSortedBitPacker &); + BasicSortedBitPacker &operator=(const BasicSortedBitPacker &); }; - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_SORTEDBITPACKING_H_ */ diff --git a/include/streamvariablebyte.h b/include/streamvariablebyte.h index 9390f95..d575688 100644 --- a/include/streamvariablebyte.h +++ b/include/streamvariablebyte.h @@ -6,64 +6,65 @@ namespace SIMDCompressionLib { - /** * StreamVByte is an integer CODEC invented by Nathan Kurz. 
*/ - - extern "C" { -uint64_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, - int delta, int type); -uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t * keyPtr, uint8_t * dataPtr, uint64_t count); -uint8_t *svb_decode_avx_d1_simple(uint32_t *out, uint8_t * keyPtr, uint8_t * dataPtr, uint64_t count); +uint64_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, + int type); +uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t *keyPtr, uint8_t *dataPtr, + uint64_t count); +uint8_t *svb_decode_avx_d1_simple(uint32_t *out, uint8_t *keyPtr, + uint8_t *dataPtr, uint64_t count); uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr, - uint8_t *dataPtr, uint32_t count, uint32_t prev); + uint8_t *dataPtr, uint32_t count, + uint32_t prev); uint32_t svb_select_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, - uint64_t count, uint32_t prev, int slot); + uint64_t count, uint32_t prev, int slot); int svb_find_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count, uint32_t prev, uint32_t key, uint32_t *presult); uint8_t *svb_insert_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, - size_t dataSize, uint32_t count, - uint32_t prev, uint32_t new_key, - uint32_t *position); + size_t dataSize, uint32_t count, + uint32_t prev, uint32_t new_key, + uint32_t *position); uint8_t *svb_append_scalar_d1(uint8_t *keyPtr, uint8_t *dataPtr, - size_t sizebytes, size_t count, uint32_t delta); + size_t sizebytes, size_t count, uint32_t delta); } - /** * Regular StreamVByte (no differential coding) */ class StreamVByte : public IntegerCODEC { public: - void encodeArray(uint32_t *in, const size_t count, uint32_t *out, - size_t &nvalue) { - uint64_t bytesWritten = svb_encode((uint8_t *)out, in, static_cast(std::min(count, std::numeric_limits::max())), 0, 1); - nvalue = static_cast(bytesWritten + 3)/4; - } - - const uint32_t * decodeArray(const uint32_t *in, const size_t /* count */, - uint32_t *out, size_t & nvalue) { - uint32_t 
count = *(uint32_t *)in; // first 4 bytes is number of ints - nvalue = count; - if (count == 0) return 0; - - uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - nvalue = count; - return reinterpret_cast((reinterpret_cast( - svb_decode_avx_simple(out, keyPtr, dataPtr, count)) - + 3) & ~3); - - } - - std::string name() const { - return "streamvbyte"; - } - + void encodeArray(uint32_t *in, const size_t count, uint32_t *out, + size_t &nvalue) { + uint64_t bytesWritten = svb_encode( + (uint8_t *)out, in, static_cast(std::min( + count, std::numeric_limits::max())), + 0, 1); + nvalue = static_cast(bytesWritten + 3) / 4; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t /* count */, + uint32_t *out, size_t &nvalue) { + uint32_t count = *(uint32_t *)in; // first 4 bytes is number of ints + nvalue = count; + if (count == 0) + return 0; + + uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + nvalue = count; + return reinterpret_cast( + (reinterpret_cast( + svb_decode_avx_simple(out, keyPtr, dataPtr, count)) + + 3) & + ~3); + } + + std::string name() const { return "streamvbyte"; } }; /** @@ -71,170 +72,176 @@ class StreamVByte : public IntegerCODEC { */ class StreamVByteD1 : public IntegerCODEC { public: - - void encodeArray(uint32_t *in, const size_t count, uint32_t *out, - size_t &nvalue) { - uint32_t bytesWritten = static_cast(svb_encode((uint8_t *)(out + 1), in, - static_cast(std::min(count, std::numeric_limits::max())), 1, 1)); - *out = 4 + bytesWritten; - nvalue = 1 + (bytesWritten + 3) / 4; - } - - void encodeToByteArray(uint32_t *in, const size_t count, uint8_t *out, - size_t &nvalue) { - uint32_t bytesWritten = static_cast(svb_encode((uint8_t 
*)(out + 1), in, - static_cast(std::min(count, std::numeric_limits::max())), 1, 1)); - *out = 4 + bytesWritten; - nvalue = 4 + bytesWritten; - } - - const uint32_t * decodeArray(const uint32_t *in, const size_t /* count */, - uint32_t *out, size_t & nvalue) { - ++in; // number of encoded bytes - uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints - nvalue = count; - if (count == 0) return 0; - - uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - return reinterpret_cast((reinterpret_cast( - svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count)) - + 3) & ~3); - } - - const uint8_t * decodeFromByteArray(const uint8_t *in, const size_t /* count */, - uint32_t *out, size_t & nvalue) { - in += 4; // number of encoded bytes - uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints - nvalue = count; - if (count == 0) return 0; - - uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - return svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count); - } - - uint32_t select(const uint32_t *in, int slot) { - ++in; // number of encoded bytes - uint32_t count = *in; // next 4 bytes is number of ints - assert(slot < (int)count); - uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - return svb_select_avx_d1_init(keyPtr, dataPtr, count, 0, slot); + void encodeArray(uint32_t *in, const size_t count, uint32_t *out, + size_t &nvalue) { + uint32_t bytesWritten = static_cast( + svb_encode((uint8_t *)(out + 1), in, + static_cast(std::min( + count, std::numeric_limits::max())), + 1, 1)); + *out = 4 + 
bytesWritten; + nvalue = 1 + (bytesWritten + 3) / 4; + } + + void encodeToByteArray(uint32_t *in, const size_t count, uint8_t *out, + size_t &nvalue) { + uint32_t bytesWritten = static_cast( + svb_encode((uint8_t *)(out + 1), in, + static_cast(std::min( + count, std::numeric_limits::max())), + 1, 1)); + *out = 4 + bytesWritten; + nvalue = 4 + bytesWritten; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t /* count */, + uint32_t *out, size_t &nvalue) { + ++in; // number of encoded bytes + uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints + nvalue = count; + if (count == 0) + return 0; + + uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + return reinterpret_cast( + (reinterpret_cast( + svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count)) + + 3) & + ~3); + } + + const uint8_t *decodeFromByteArray(const uint8_t *in, + const size_t /* count */, uint32_t *out, + size_t &nvalue) { + in += 4; // number of encoded bytes + uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints + nvalue = count; + if (count == 0) + return 0; + + uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + return svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count); + } + + uint32_t select(const uint32_t *in, int slot) { + ++in; // number of encoded bytes + uint32_t count = *in; // next 4 bytes is number of ints + assert(slot < (int)count); + uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + return svb_select_avx_d1_init(keyPtr, dataPtr, count, 0, slot); + } + + uint32_t findLowerBound(const 
uint32_t *in, uint32_t /* count */, + uint32_t key, uint32_t *presult) { + ++in; // skip number of encoded bytes + uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints + uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + return (uint32_t)svb_find_avx_d1_init(keyPtr, dataPtr, count, 0, key, + presult); + } + + // append a key. Keys must be in sorted order. We assume that there is + // enough room and that delta encoding was used. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t /* length */, + uint32_t previous_key, uint32_t key) { + uint8_t *initin = in; + size_t size = *(uint32_t *)in; + in += 4; + size_t count = *(uint32_t *)in; + in += 4; + + // if the buffer is not yet initialized: pretend that the first 8 + // bytes are already occupied + if (size == 0) + size = 8; + + uint8_t *keyPtr = (uint8_t *)in; // full list of keys is next + uint32_t keyLen = + static_cast((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts after the keys + size = svb_append_scalar_d1(keyPtr, dataPtr, size - 8, count, + key - previous_key) - + initin; + + // update 'size' and 'count' at the beginning of the buffer + in = initin; + *(uint32_t *)in = static_cast(size); + in += 4; + *(uint32_t *)in = static_cast(count + 1); + return size; + } + + // Inserts |key| into an encoded sequence. |encodedSize| is the total + // allocated size for |in| (in bytes). + // Returns the number of values written. + uint32_t insert(uint32_t *in, uint32_t, uint32_t key) { + uint32_t bytesEncoded = *in; + uint32_t count = *(in + 1); // first 4 bytes is number of ints + uint8_t *keyPtr = (uint8_t *)(in + 2); // full list of keys is next + // keyLen: 2-bits per key (rounded up), but at least 1 byte + uint32_t keyLen = count == 0 ? 
1 : ((count + 3) / 4); + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + uint32_t dataSize = (bytesEncoded - 8) - keyLen; + + // make space for the new key? + if (count > 0 && count % 4 == 0 && keyPtr + keyLen + 1 > dataPtr) { + memmove(dataPtr + 1, dataPtr, dataSize); + dataPtr++; } - uint32_t findLowerBound(const uint32_t *in, uint32_t /* count */, - uint32_t key, uint32_t *presult) { - ++in; // skip number of encoded bytes - uint32_t count = *(uint32_t *)in; // next 4 bytes is number of ints - uint8_t *keyPtr = (uint8_t *)in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - return (uint32_t)svb_find_avx_d1_init(keyPtr, dataPtr, count, - 0, key, presult); + *(in + 1) = count + 1; + + uint32_t position; + uint32_t bytesWritten = static_cast( + svb_insert_scalar_d1_init(keyPtr, dataPtr, dataSize, count, 0, key, + &position) - + (uint8_t *)in); + *in = bytesWritten; + return (bytesWritten + 3) / 4; + } + + // Inserts |key| into an encoded sequence. |encodedSize| is the total + // allocated size for |in| (in bytes). + // Returns the number of *bytes* written. + size_t insertInByteArray(uint8_t *inbyte, uint32_t, uint32_t key) { + uint32_t *in = (uint32_t *)inbyte; + uint32_t bytesEncoded = *in; + uint32_t count = *(in + 1); // first 4 bytes is number of ints + uint8_t *keyPtr = (uint8_t *)(in + 2); // full list of keys is next + // keyLen: 2-bits per key (rounded up), but at least 1 byte + uint32_t keyLen = count == 0 ? 1 : ((count + 3) / 4); + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + uint32_t dataSize = (bytesEncoded - 8) - keyLen; + + // make space for the new key? + if (count > 0 && count % 4 == 0 && keyPtr + keyLen + 1 > dataPtr) { + memmove(dataPtr + 1, dataPtr, dataSize); + dataPtr++; } - // append a key. Keys must be in sorted order. 
We assume that there is - // enough room and that delta encoding was used. - // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t /* length */, uint32_t previous_key, - uint32_t key) { - uint8_t *initin = in; - size_t size = *(uint32_t *)in; - in += 4; - size_t count = *(uint32_t *)in; - in += 4; - - // if the buffer is not yet initialized: pretend that the first 8 - // bytes are already occupied - if (size == 0) - size = 8; - - uint8_t *keyPtr = (uint8_t *)in; // full list of keys is next - uint32_t keyLen = static_cast((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts after the keys - size = svb_append_scalar_d1(keyPtr, dataPtr, size - 8, count, - key - previous_key) - initin; - - // update 'size' and 'count' at the beginning of the buffer - in = initin; - *(uint32_t *)in = static_cast(size); - in += 4; - *(uint32_t *)in = static_cast(count + 1); - return size; - } - - // Inserts |key| into an encoded sequence. |encodedSize| is the total - // allocated size for |in| (in bytes). - // Returns the number of values written. - uint32_t insert(uint32_t *in, uint32_t, uint32_t key) { - uint32_t bytesEncoded = *in; - uint32_t count = *(in + 1); // first 4 bytes is number of ints - uint8_t *keyPtr = (uint8_t *)(in + 2); // full list of keys is next - // keyLen: 2-bits per key (rounded up), but at least 1 byte - uint32_t keyLen = count == 0 ? 1 : ((count + 3) / 4); - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - uint32_t dataSize = (bytesEncoded - 8) - keyLen; - - // make space for the new key? 
- if (count > 0 - && count % 4 == 0 - && keyPtr + keyLen + 1 > dataPtr) { - memmove(dataPtr + 1, dataPtr, dataSize); - dataPtr++; - } - - *(in + 1) = count + 1; - - uint32_t position; - uint32_t bytesWritten = static_cast(svb_insert_scalar_d1_init(keyPtr, dataPtr, - dataSize, count, 0, key, &position) - - (uint8_t *)in); - *in = bytesWritten; - return (bytesWritten + 3) / 4; - } + *(in + 1) = count + 1; - // Inserts |key| into an encoded sequence. |encodedSize| is the total - // allocated size for |in| (in bytes). - // Returns the number of *bytes* written. - size_t insertInByteArray(uint8_t *inbyte, uint32_t, uint32_t key) { - uint32_t * in = (uint32_t *) inbyte; - uint32_t bytesEncoded = *in; - uint32_t count = *(in + 1); // first 4 bytes is number of ints - uint8_t *keyPtr = (uint8_t *)(in + 2); // full list of keys is next - // keyLen: 2-bits per key (rounded up), but at least 1 byte - uint32_t keyLen = count == 0 ? 1 : ((count + 3) / 4); - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - uint32_t dataSize = (bytesEncoded - 8) - keyLen; - - // make space for the new key? 
- if (count > 0 - && count % 4 == 0 - && keyPtr + keyLen + 1 > dataPtr) { - memmove(dataPtr + 1, dataPtr, dataSize); - dataPtr++; - } - - *(in + 1) = count + 1; - - uint32_t position; - uint32_t bytesWritten = static_cast(svb_insert_scalar_d1_init(keyPtr, dataPtr, - dataSize, count, 0, key, &position) - - (uint8_t *)in); - *in = bytesWritten; - return bytesWritten; - } - - std::string name() const { - return "streamvbyte_d1"; - } + uint32_t position; + uint32_t bytesWritten = static_cast( + svb_insert_scalar_d1_init(keyPtr, dataPtr, dataSize, count, 0, key, + &position) - + (uint8_t *)in); + *in = bytesWritten; + return bytesWritten; + } + std::string name() const { return "streamvbyte_d1"; } }; } // namespace SIMDCompressionLib - #endif // STREAMVARIABLEBYTE_ diff --git a/include/synthetic.h b/include/synthetic.h index 44f17cf..2f0a7a5 100644 --- a/include/synthetic.h +++ b/include/synthetic.h @@ -8,7 +8,6 @@ #ifndef SIMDCompressionAndIntersection_SYNTHETIC_H_ #define SIMDCompressionAndIntersection_SYNTHETIC_H_ - #include "common.h" #include "util.h" #include "mersenne.h" @@ -19,104 +18,103 @@ namespace SIMDCompressionLib { using namespace std; - - vector generateArray(uint32_t N, const uint32_t mask = 0xFFFFFFFFU) { - vector ans(N); - for (size_t k = 0; k < N; ++k) - ans[k] = rand() & mask; - return ans; + vector ans(N); + for (size_t k = 0; k < N; ++k) + ans[k] = rand() & mask; + return ans; } -vector generateArray32(uint32_t N, const uint32_t mask = 0xFFFFFFFFU) { - vector ans(N); - for (size_t k = 0; k < N; ++k) - ans[k] = rand() & mask; - return ans; +vector generateArray32(uint32_t N, + const uint32_t mask = 0xFFFFFFFFU) { + vector ans(N); + for (size_t k = 0; k < N; ++k) + ans[k] = rand() & mask; + return ans; } - class UniformDataGenerator { public: - UniformDataGenerator(uint32_t seed = static_cast(time(NULL))) : - rand(seed) { + UniformDataGenerator(uint32_t seed = static_cast(time(NULL))) + : rand(seed) {} + + void negate(vector &in, vector &out, 
uint32_t Max) { + out.resize(Max - in.size()); + in.push_back(Max); + uint32_t i = 0; + size_t c = 0; + for (size_t j = 0; j < in.size(); ++j) { + const uint32_t v = in[j]; + for (; i < v; ++i) + out[c++] = i; + ++i; } - - void negate(vector &in , vector &out, uint32_t Max) { - out.resize(Max - in.size()); - in.push_back(Max); - uint32_t i = 0; - size_t c = 0; - for (size_t j = 0; j < in.size() ; ++j) { - const uint32_t v = in[j]; - for (; i < v; ++i) - out[c++] = i; - ++i; - } - assert(c == out.size()); + assert(c == out.size()); + } + + /** + * fill the vector with N numbers uniformly picked from from 0 to Max, not + * including Max + * if it is not possible, an exception is thrown + */ + vector generateUniformHash(uint32_t N, uint32_t Max, + vector &ans) { + if (Max < N) + throw runtime_error( + "can't generate enough distinct elements in small interval"); + ans.clear(); + if (N == 0) + return ans; // nothing to do + ans.reserve(N); + assert(Max >= 1); + unordered_set s; + while (s.size() < N) + s.insert(rand.getValue(Max - 1)); + ans.assign(s.begin(), s.end()); + sort(ans.begin(), ans.end()); + assert(N == ans.size()); + return ans; + } + + void generateUniformBitmap(uint32_t N, uint32_t Max, vector &ans) { + if (Max < N) + throw runtime_error( + "can't generate enough distinct elements in small interval"); + assert(Max >= 1); + BoolArray bs(Max); + uint32_t card = 0; + while (card < N) { + uint32_t v = rand.getValue(Max - 1); + if (!bs.get(v)) { + bs.set(v); + ++card; + } } - - /** - * fill the vector with N numbers uniformly picked from from 0 to Max, not including Max - * if it is not possible, an exception is thrown - */ - vector generateUniformHash(uint32_t N, uint32_t Max, vector &ans) { - if (Max < N) throw runtime_error("can't generate enough distinct elements in small interval"); - ans.clear(); - if (N == 0) return ans; // nothing to do - ans.reserve(N); - assert(Max >= 1); - unordered_set s; - while (s.size() < N) - s.insert(rand.getValue(Max - 1)); - 
ans.assign(s.begin(), s.end()); - sort(ans.begin(), ans.end()); - assert(N == ans.size()); - return ans; + ans.resize(N); + bs.toArray(ans); + } + + void fastgenerateUniform(uint32_t N, uint32_t Max, vector &ans) { + if (2 * N > Max) { + vector buf(N); + fastgenerateUniform(Max - N, Max, buf); + negate(buf, ans, Max); + return; } - - - void generateUniformBitmap(uint32_t N, uint32_t Max, vector &ans) { - if (Max < N) throw runtime_error("can't generate enough distinct elements in small interval"); - assert(Max >= 1); - BoolArray bs(Max); - uint32_t card = 0; - while (card < N) { - uint32_t v = rand.getValue(Max - 1) ; - if (!bs.get(v)) { - bs.set(v); - ++card; - } - } - ans.resize(N); - bs.toArray(ans); + if (N * 1024 > Max) { + generateUniformBitmap(N, Max, ans); } - - - void fastgenerateUniform(uint32_t N, uint32_t Max, vector &ans) { - if (2 * N > Max) { - vector buf(N); - fastgenerateUniform(Max - N, Max, buf); - negate(buf, ans, Max); - return; - } - if (N * 1024 > Max) { - generateUniformBitmap(N, Max, ans); - } - generateUniformHash(N, Max, ans); - } - - - - // Max value is excluded from range - vector generate(uint32_t N, uint32_t Max) { - vector ans; - ans.reserve(N); - fastgenerateUniform(N, Max, ans); - return ans; - } - ZRandom rand; - + generateUniformHash(N, Max, ans); + } + + // Max value is excluded from range + vector generate(uint32_t N, uint32_t Max) { + vector ans; + ans.reserve(N); + fastgenerateUniform(N, Max, ans); + return ans; + } + ZRandom rand; }; /* @@ -125,226 +123,217 @@ class UniformDataGenerator { */ class ClusteredDataGenerator { public: - - vector buffer; - UniformDataGenerator unidg; - ClusteredDataGenerator(uint32_t seed = static_cast(time(NULL))) : - buffer(), unidg(seed) { + vector buffer; + UniformDataGenerator unidg; + ClusteredDataGenerator(uint32_t seed = static_cast(time(NULL))) + : buffer(), unidg(seed) {} + + // Max value is excluded from range + template + void fillUniform(iterator begin, iterator end, uint32_t Min, 
uint32_t Max) { + unidg.fastgenerateUniform(static_cast(end - begin), Max - Min, + buffer); + for (size_t k = 0; k < buffer.size(); ++k) + *(begin + k) = Min + buffer[k]; + } + + // Max value is excluded from range + // throws exception if impossible + template + void fillClustered(iterator begin, iterator end, uint32_t Min, uint32_t Max) { + const uint32_t N = static_cast(end - begin); + const uint32_t range = Max - Min; + if (range < N) + throw runtime_error("can't generate that many in small interval."); + assert(range >= N); + if ((range == N) or (N < 10)) { + fillUniform(begin, end, Min, Max); + return; } - - // Max value is excluded from range - template - void fillUniform(iterator begin, iterator end, uint32_t Min, uint32_t Max) { - unidg.fastgenerateUniform(static_cast(end - begin), Max - Min, buffer); - for (size_t k = 0; k < buffer.size(); ++k) - *(begin + k) = Min + buffer[k]; + const uint32_t cut = N / 2 + unidg.rand.getValue(range - N); + assert(cut >= N / 2); + assert(Max - Min - cut >= N - N / 2); + const double p = unidg.rand.getDouble(); + assert(p <= 1); + assert(p >= 0); + if (p <= 0.25) { + fillUniform(begin, begin + N / 2, Min, Min + cut); + fillClustered(begin + N / 2, end, Min + cut, Max); + } else if (p <= 0.5) { + fillClustered(begin, begin + N / 2, Min, Min + cut); + fillUniform(begin + N / 2, end, Min + cut, Max); + } else { + fillClustered(begin, begin + N / 2, Min, Min + cut); + fillClustered(begin + N / 2, end, Min + cut, Max); } + } + // Max value is excluded from range + vector generate(uint32_t N, uint32_t Max) { + return generateClustered(N, Max); + } - // Max value is excluded from range - // throws exception if impossible - template - void fillClustered(iterator begin, iterator end, uint32_t Min, uint32_t Max) { - const uint32_t N = static_cast(end - begin); - const uint32_t range = Max - Min; - if (range < N) throw runtime_error("can't generate that many in small interval."); - assert(range >= N); - if ((range == N) or (N < 10)) 
{ - fillUniform(begin, end, Min, Max); - return; - } - const uint32_t cut = N / 2 + unidg.rand.getValue(range - N); - assert(cut >= N / 2); - assert(Max - Min - cut >= N - N / 2); - const double p = unidg.rand.getDouble(); - assert(p <= 1); - assert(p >= 0); - if (p <= 0.25) { - fillUniform(begin, begin + N / 2, Min, Min + cut); - fillClustered(begin + N / 2, end, Min + cut, Max); - } else if (p <= 0.5) { - fillClustered(begin, begin + N / 2, Min, Min + cut); - fillUniform(begin + N / 2, end, Min + cut, Max); - } else { - fillClustered(begin, begin + N / 2, Min, Min + cut); - fillClustered(begin + N / 2, end, Min + cut, Max); - } - } - - // Max value is excluded from range - vector generate(uint32_t N, uint32_t Max) { - return generateClustered(N, Max); - } - - // Max value is excluded from range - vector generateClustered(uint32_t N, uint32_t Max) { - vector ans(N); - fillClustered(ans.begin(), ans.end(), 0, Max); - return ans; - } - + // Max value is excluded from range + vector generateClustered(uint32_t N, uint32_t Max) { + vector ans(N); + fillClustered(ans.begin(), ans.end(), 0, Max); + return ans; + } }; class ZipfianGenerator { public: - - uint32_t n; - double zetan, theta; - vector proba; - - ZRandom rand; - ZipfianGenerator(uint32_t seed = static_cast(time(NULL))) : - n(0), zetan(0), theta(0), proba(n), rand(seed) { + uint32_t n; + double zetan, theta; + vector proba; + + ZRandom rand; + ZipfianGenerator(uint32_t seed = static_cast(time(NULL))) + : n(0), zetan(0), theta(0), proba(n), rand(seed) {} + + void init(int _items, double _zipfianconstant = 1.0) { + n = _items; + if (_items == 0) + throw runtime_error("no items?"); + theta = _zipfianconstant; + if (theta > 0) { + zetan = 1 / zeta(n, theta); + proba.clear(); + proba.resize(n, 0); + proba[0] = zetan; + for (uint32_t i = 1; i < n; ++i) + proba[i] = proba[i - 1] + zetan / pow(i + 1, theta); + } else { + proba.resize(n, 1.0 / n); } + } - void init(int _items, double _zipfianconstant = 1.0) { - n = 
_items; - if (_items == 0) throw runtime_error("no items?"); - theta = _zipfianconstant; - if (theta > 0) { - zetan = 1 / zeta(n, theta); - proba.clear(); - proba.resize(n, 0); - proba[0] = zetan; - for (uint32_t i = 1; i < n; ++i) - proba[i] = proba[i - 1] + zetan / pow(i + 1, theta); - } else { - proba.resize(n, 1.0 / n); - } - } - - void seed(uint32_t s) { - rand.seed(s); - } + void seed(uint32_t s) { rand.seed(s); } - ZipfianGenerator(int _items, double _zipfianconstant, - uint32_t seed = static_cast(time(NULL))) : - n(_items), zetan(0), theta(_zipfianconstant), proba(n), rand(seed) { - init(_items, _zipfianconstant); - } + ZipfianGenerator(int _items, double _zipfianconstant, + uint32_t seed = static_cast(time(NULL))) + : n(_items), zetan(0), theta(_zipfianconstant), proba(n), rand(seed) { + init(_items, _zipfianconstant); + } - double zeta(int n, double theta) { - double sum = 0; - for (long i = 0; i < n; i++) { - sum += 1 / (pow(i + 1, theta)); - } - return sum; - } - int nextInt() { - // Map z to the value - const double u = rand.getDouble(); - return static_cast(lower_bound(proba.begin(), proba.end(), u) - proba.begin()); + double zeta(int n, double theta) { + double sum = 0; + for (long i = 0; i < n; i++) { + sum += 1 / (pow(i + 1, theta)); } - + return sum; + } + int nextInt() { + // Map z to the value + const double u = rand.getDouble(); + return static_cast(lower_bound(proba.begin(), proba.end(), u) - + proba.begin()); + } }; vector generateZipfianArray32(uint32_t N, double power, const uint32_t mask = 0xFFFFFFFFU) { - vector ans(N); - ZipfianGenerator zipf; - const uint32_t MAXVALUE = 1U << 22; - zipf.init(mask > MAXVALUE - 1 ? MAXVALUE : mask + 1, power); - for (size_t k = 0; k < N; ++k) - ans[k] = zipf.nextInt(); - return ans; + vector ans(N); + ZipfianGenerator zipf; + const uint32_t MAXVALUE = 1U << 22; + zipf.init(mask > MAXVALUE - 1 ? 
MAXVALUE : mask + 1, power); + for (size_t k = 0; k < N; ++k) + ans[k] = zipf.nextInt(); + return ans; } - - -size_t unite(const uint32_t *set1, const size_t length1, - const uint32_t *set2, const size_t length2, uint32_t *out) { - size_t pos = 0; - size_t k1 = 0, k2 = 0; - if (0 == length1) { - for (size_t k = 0; k < length2; ++k) - out[k] = set2[k]; - return length2; +size_t unite(const uint32_t *set1, const size_t length1, const uint32_t *set2, + const size_t length2, uint32_t *out) { + size_t pos = 0; + size_t k1 = 0, k2 = 0; + if (0 == length1) { + for (size_t k = 0; k < length2; ++k) + out[k] = set2[k]; + return length2; + } + if (0 == length2) { + for (size_t k = 0; k < length1; ++k) + out[k] = set1[k]; + return length1; + } + while (true) { + if (set1[k1] < set2[k2]) { + out[pos++] = set1[k1]; + ++k1; + if (k1 >= length1) { + for (; k2 < length2; ++k2) + out[pos++] = set2[k2]; + break; + } + } else if (set1[k1] == set2[k2]) { + out[pos++] = set1[k1]; + ++k1; + ++k2; + if (k1 >= length1) { + for (; k2 < length2; ++k2) + out[pos++] = set2[k2]; + break; + } + if (k2 >= length2) { + for (; k1 < length1; ++k1) + out[pos++] = set1[k1]; + break; + } + } else { // if (set1[k1]>set2[k2]) { + out[pos++] = set2[k2]; + ++k2; + if (k2 >= length2) { + for (; k1 < length1; ++k1) + out[pos++] = set1[k1]; + break; + } } - if (0 == length2) { - for (size_t k = 0; k < length1; ++k) - out[k] = set1[k]; - return length1; - } - while (true) { - if (set1[k1] < set2[k2]) { - out[pos++] = set1[k1]; - ++k1; - if (k1 >= length1) { - for (; k2 < length2; ++k2) - out[pos++] = set2[k2]; - break; - } - } else if (set1[k1] == set2[k2]) { - out[pos++] = set1[k1]; - ++k1; - ++k2; - if (k1 >= length1) { - for (; k2 < length2; ++k2) - out[pos++] = set2[k2]; - break; - } - if (k2 >= length2) { - for (; k1 < length1; ++k1) - out[pos++] = set1[k1]; - break; - } - } else {// if (set1[k1]>set2[k2]) { - out[pos++] = set2[k2]; - ++k2; - if (k2 >= length2) { - for (; k1 < length1; ++k1) - out[pos++] 
= set1[k1]; - break; - } - } - } - return pos; + } + return pos; } - vector unite(const vector &x, const vector &y) { - vector ans(x.size() + y.size()); - ans.resize(unite(x.data(), x.size(), y.data(), y.size(), ans.data())); - return ans; + vector ans(x.size() + y.size()); + ans.resize(unite(x.data(), x.size(), y.data(), y.size(), ans.data())); + return ans; } -size_t classicalintersection(const uint32_t *set1, - const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out) { - if ((0 == length1) or (0 == length2)) - return 0; - size_t answer = 0; - size_t k1 = 0, k2 = 0; - while (true) { - if (set1[k1] < set2[k2]) { - ++k1; - if (k1 == length1) - return answer; - } else if (set2[k2] < set1[k1]) { - ++k2; - if (k2 == length2) - return answer; - } else { - // (set2[k2] == set1[k1]) - out[answer++] = set1[k1]; - ++k1; - if (k1 == length1) - break; - ++k2; - if (k2 == length2) - break; - - } +size_t classicalintersection(const uint32_t *set1, const size_t length1, + const uint32_t *set2, const size_t length2, + uint32_t *out) { + if ((0 == length1) or (0 == length2)) + return 0; + size_t answer = 0; + size_t k1 = 0, k2 = 0; + while (true) { + if (set1[k1] < set2[k2]) { + ++k1; + if (k1 == length1) + return answer; + } else if (set2[k2] < set1[k1]) { + ++k2; + if (k2 == length2) + return answer; + } else { + // (set2[k2] == set1[k1]) + out[answer++] = set1[k1]; + ++k1; + if (k1 == length1) + break; + ++k2; + if (k2 == length2) + break; } - return answer; - + } + return answer; } - - -vector intersect(const vector &x, const vector &y) { - vector ans(x.size() + y.size()); - ans.resize(classicalintersection(x.data(), x.size(), y.data(), y.size(), ans.data())); - return ans; +vector intersect(const vector &x, + const vector &y) { + vector ans(x.size() + y.size()); + ans.resize(classicalintersection(x.data(), x.size(), y.data(), y.size(), + ans.data())); + return ans; } /** * Generate a pair of arrays. One small, one larger. 
@@ -355,31 +344,46 @@ vector intersect(const vector &x, const vector &y) * intersectionratio * minlength : length of the intersection */ template -pair, vector> getPair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio, -float intersectionratio) { - if (sizeratio < 1) throw runtime_error("sizeratio should be larger or equal to 1"); - if (intersectionratio < 0) throw runtime_error("intersectionratio should be positive"); - if (intersectionratio > 1) throw runtime_error("intersectionratio cannot be larger than 1"); - const uint32_t maxlenth = static_cast(round(static_cast(minlength) * sizeratio)); - if (maxlenth > Max) throw runtime_error("I can't generate an array so large in such a small range."); - if (maxlenth < minlength) throw runtime_error("something went wrong, possibly an overflow."); - // we basically assume that, if we do nothing, intersections are very small - const uint32_t intersize = static_cast(round(static_cast(minlength) * intersectionratio)); - - vector inter = gen.generate(intersize, Max); - vector smallest = unite(gen.generate(static_cast(minlength - inter.size()), Max), inter); - vector largest = unite(gen.generate(static_cast(maxlenth - inter.size()), Max), inter); - vector intersection = intersect(smallest, largest); - - if (abs(static_cast(intersection.size()) / static_cast(smallest.size()) - intersectionratio) > 0.05) - throw runtime_error("Bad intersection ratio. Fix me."); - - if (abs(static_cast(largest.size()) / static_cast(smallest.size()) - sizeratio) > 0.05) - throw runtime_error("Bad size ratio. 
Fix me."); - return pair, vector>(smallest, largest); +pair, vector> +getPair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio, + float intersectionratio) { + if (sizeratio < 1) + throw runtime_error("sizeratio should be larger or equal to 1"); + if (intersectionratio < 0) + throw runtime_error("intersectionratio should be positive"); + if (intersectionratio > 1) + throw runtime_error("intersectionratio cannot be larger than 1"); + const uint32_t maxlenth = + static_cast(round(static_cast(minlength) * sizeratio)); + if (maxlenth > Max) + throw runtime_error( + "I can't generate an array so large in such a small range."); + if (maxlenth < minlength) + throw runtime_error("something went wrong, possibly an overflow."); + // we basically assume that, if we do nothing, intersections are very small + const uint32_t intersize = static_cast( + round(static_cast(minlength) * intersectionratio)); + + vector inter = gen.generate(intersize, Max); + vector smallest = + unite(gen.generate(static_cast(minlength - inter.size()), Max), + inter); + vector largest = unite( + gen.generate(static_cast(maxlenth - inter.size()), Max), inter); + vector intersection = intersect(smallest, largest); + + if (abs(static_cast(intersection.size()) / + static_cast(smallest.size()) - + intersectionratio) > 0.05) + throw runtime_error("Bad intersection ratio. Fix me."); + + if (abs(static_cast(largest.size()) / + static_cast(smallest.size()) - + sizeratio) > 0.05) + throw runtime_error("Bad size ratio. 
Fix me."); + return pair, vector>(smallest, largest); } - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_SYNTHETIC_H_ */ diff --git a/include/timer.h b/include/timer.h index 5a6b459..9a46611 100644 --- a/include/timer.h +++ b/include/timer.h @@ -7,7 +7,6 @@ #ifndef SIMDCompressionAndIntersection_TIMER_H_ #define SIMDCompressionAndIntersection_TIMER_H_ - #include #include #include @@ -17,73 +16,67 @@ namespace SIMDCompressionLib { class WallClockTimer { public: #ifdef _WIN32 - typedef qpc_clock clock; + typedef qpc_clock clock; #else - typedef std::chrono::high_resolution_clock clock; + typedef std::chrono::high_resolution_clock clock; #endif - std::chrono::time_point t1, t2; - WallClockTimer() : - t1(), t2() { - t1 = clock::now(); - t2 = t1; - } - void reset() { - t1 = clock::now(); - t2 = t1; - } - uint64_t elapsed() { - std::chrono::microseconds delta = std::chrono::duration_cast(t2 - t1); - return delta.count(); - } - uint64_t split() { - t2 = clock::now(); - return elapsed(); - } + std::chrono::time_point t1, t2; + WallClockTimer() : t1(), t2() { + t1 = clock::now(); + t2 = t1; + } + void reset() { + t1 = clock::now(); + t2 = t1; + } + uint64_t elapsed() { + std::chrono::microseconds delta = + std::chrono::duration_cast(t2 - t1); + return delta.count(); + } + uint64_t split() { + t2 = clock::now(); + return elapsed(); + } }; - #ifndef _WIN32 class CPUTimer { public: - //clock_t t1, t2; - struct rusage t1, t2; + // clock_t t1, t2; + struct rusage t1, t2; - CPUTimer() : - t1(), t2() { - getrusage(RUSAGE_SELF, &t1); - //t1 = clock(); - t2 = t1; - } - void reset() { - getrusage(RUSAGE_SELF, &t1); - t2 = t1; - } - // proxy for userelapsed - uint64_t elapsed() { - return totalelapsed(); - } + CPUTimer() : t1(), t2() { + getrusage(RUSAGE_SELF, &t1); + // t1 = clock(); + t2 = t1; + } + void reset() { + getrusage(RUSAGE_SELF, &t1); + t2 = t1; + } + // proxy for userelapsed + uint64_t elapsed() { return totalelapsed(); } - uint64_t 
totalelapsed() { - return userelapsed() + systemelapsed(); - } - // returns the *user* CPU time in micro seconds (mu s) - uint64_t userelapsed() { - return ((t2.ru_utime.tv_sec - t1.ru_utime.tv_sec) * 1000ULL * 1000ULL) + ((t2.ru_utime.tv_usec - t1.ru_utime.tv_usec) - ); - } + uint64_t totalelapsed() { return userelapsed() + systemelapsed(); } + // returns the *user* CPU time in micro seconds (mu s) + uint64_t userelapsed() { + return ((t2.ru_utime.tv_sec - t1.ru_utime.tv_sec) * 1000ULL * 1000ULL) + + ((t2.ru_utime.tv_usec - t1.ru_utime.tv_usec)); + } - // returns the *system* CPU time in micro seconds (mu s) - uint64_t systemelapsed() { - return ((t2.ru_stime.tv_sec - t1.ru_stime.tv_sec) * 1000ULL * 1000ULL) + ((t2.ru_stime.tv_usec - t1.ru_stime.tv_usec) - ); - } + // returns the *system* CPU time in micro seconds (mu s) + uint64_t systemelapsed() { + return ((t2.ru_stime.tv_sec - t1.ru_stime.tv_sec) * 1000ULL * 1000ULL) + + ((t2.ru_stime.tv_usec - t1.ru_stime.tv_usec)); + } - uint64_t split() { - getrusage(RUSAGE_SELF, &t2); - return elapsed(); - } + uint64_t split() { + getrusage(RUSAGE_SELF, &t2); + return elapsed(); + } }; #endif diff --git a/include/usimdbitpacking.h b/include/usimdbitpacking.h index c1437ca..8822bea 100644 --- a/include/usimdbitpacking.h +++ b/include/usimdbitpacking.h @@ -11,111 +11,139 @@ namespace SIMDCompressionLib { +void __uSIMD_fastunpack1(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack2(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack3(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack4(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack5(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack6(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack7(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack8(const __m128i *__restrict__, uint32_t 
*__restrict__); +void __uSIMD_fastunpack9(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack10(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack11(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack12(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack13(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack14(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack15(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack16(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack17(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack18(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack19(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack20(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack21(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack22(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack23(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack24(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack25(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack26(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack27(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack28(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack29(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack30(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack31(const __m128i *__restrict__, uint32_t *__restrict__); +void __uSIMD_fastunpack32(const __m128i *__restrict__, uint32_t *__restrict__); -void __uSIMD_fastunpack1(const __m128i 
* __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack2(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack3(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack4(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack5(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack6(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack7(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack8(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack9(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack10(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack11(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack12(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack13(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack14(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack15(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack16(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack17(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack18(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack19(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack20(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack21(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack22(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack23(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack24(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack25(const __m128i * __restrict__, 
uint32_t * __restrict__); -void __uSIMD_fastunpack26(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack27(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack28(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack29(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack30(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack31(const __m128i * __restrict__, uint32_t * __restrict__); -void __uSIMD_fastunpack32(const __m128i * __restrict__, uint32_t * __restrict__); - - -void __uSIMD_fastpackwithoutmask0(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask1(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask2(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask3(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask4(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask5(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask6(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask7(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask8(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask9(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask10(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask11(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask12(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask13(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask14(const uint32_t * __restrict__, __m128i * __restrict__); -void 
__uSIMD_fastpackwithoutmask15(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask16(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask17(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask18(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask19(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask20(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask21(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask22(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask23(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask24(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask25(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask26(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask27(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask28(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask29(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask30(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask31(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpackwithoutmask32(const uint32_t * __restrict__, __m128i * __restrict__); - -void __uSIMD_fastpack0(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack1(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack2(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack3(const uint32_t * __restrict__, __m128i * __restrict__); -void 
__uSIMD_fastpack4(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack5(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack6(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack7(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack8(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack9(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack10(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack11(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack12(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack13(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack14(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack15(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack16(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack17(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack18(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack19(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack20(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack21(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack22(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack23(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack24(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack25(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack26(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack27(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack28(const uint32_t * __restrict__, __m128i * 
__restrict__); -void __uSIMD_fastpack29(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack30(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack31(const uint32_t * __restrict__, __m128i * __restrict__); -void __uSIMD_fastpack32(const uint32_t * __restrict__, __m128i * __restrict__); - - +void __uSIMD_fastpackwithoutmask0(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask1(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask2(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask3(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask4(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask5(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask6(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask7(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask8(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask9(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask10(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask11(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask12(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask13(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask14(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask15(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask16(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask17(const uint32_t *__restrict__, + __m128i *__restrict__); +void 
__uSIMD_fastpackwithoutmask18(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask19(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask20(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask21(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask22(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask23(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask24(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask25(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask26(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask27(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask28(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask29(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask30(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask31(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpackwithoutmask32(const uint32_t *__restrict__, + __m128i *__restrict__); +void __uSIMD_fastpack0(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack1(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack2(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack3(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack4(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack5(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack6(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack7(const uint32_t *__restrict__, __m128i *__restrict__); 
+void __uSIMD_fastpack8(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack9(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack10(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack11(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack12(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack13(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack14(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack15(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack16(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack17(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack18(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack19(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack20(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack21(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack22(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack23(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack24(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack25(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack26(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack27(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack28(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack29(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack30(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack31(const uint32_t *__restrict__, __m128i *__restrict__); +void __uSIMD_fastpack32(const uint32_t *__restrict__, __m128i *__restrict__); } // namespace 
SIMDCompressionLib diff --git a/include/util.h b/include/util.h index 48d87c3..f94b7dd 100644 --- a/include/util.h +++ b/include/util.h @@ -13,28 +13,30 @@ namespace SIMDCompressionLib { inline uint32_t random(int b) { - if (b == 32) return rand(); - return rand() % (1U << b); + if (b == 32) + return rand(); + return rand() % (1U << b); } // taken from stackoverflow #ifndef NDEBUG -# define ASSERT(condition, message) \ - do { \ - if (! (condition)) { \ - std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ - << " line " << __LINE__ << ": " << message << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } while (false) +#define ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ + << " line " << __LINE__ << ": " << message << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (false) #else -# define ASSERT(condition, message) do { } while (false) +#define ASSERT(condition, message) \ + do { \ + } while (false) #endif - CONST_FUNCTION inline uint32_t gccbits(const uint32_t v) { - return v == 0 ? 0 : 32 - __builtin_clz(v); + return v == 0 ? 0 : 32 - __builtin_clz(v); } /** @@ -42,98 +44,82 @@ inline uint32_t gccbits(const uint32_t v) { * number of bits used (integer logarithm). 
*/ inline uint32_t maxbitas32int(const __m128i accumulator) { - SIMDCOMP_ALIGNED(16) uint32_t tmparray[4]; - MM_STORE_SI_128(reinterpret_cast<__m128i *>(tmparray), accumulator); - return gccbits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); + SIMDCOMP_ALIGNED(16) uint32_t tmparray[4]; + MM_STORE_SI_128(reinterpret_cast<__m128i *>(tmparray), accumulator); + return gccbits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); } -static CONST_FUNCTION -bool divisibleby(size_t a, uint32_t x) { - return (a % x == 0); +static CONST_FUNCTION bool divisibleby(size_t a, uint32_t x) { + return (a % x == 0); } #ifdef __GNUC__ __attribute__((unused)) #endif -static void checkifdivisibleby(size_t a, uint32_t x) { - if (!divisibleby(a, x)) { - std::ostringstream convert; - convert << a << " not divisible by " << x; - throw std::logic_error(convert.str()); - } +static void +checkifdivisibleby(size_t a, uint32_t x) { + if (!divisibleby(a, x)) { + std::ostringstream convert; + convert << a << " not divisible by " << x; + throw std::logic_error(convert.str()); + } } - - -template -PURE_FUNCTION -uint32_t maxbits(const iterator &begin, const iterator &end) { - uint32_t accumulator = 0; - for (iterator k = begin; k != end; ++k) { - accumulator |= *k; - } - return gccbits(accumulator); +template +PURE_FUNCTION uint32_t maxbits(const iterator &begin, const iterator &end) { + uint32_t accumulator = 0; + for (iterator k = begin; k != end; ++k) { + accumulator |= *k; + } + return gccbits(accumulator); } - template -CONST_FUNCTION -inline bool needPaddingTo128Bits(const T *inbyte) { - return (reinterpret_cast(inbyte) & 15) != 0; +CONST_FUNCTION inline bool needPaddingTo128Bits(const T *inbyte) { + return (reinterpret_cast(inbyte) & 15) != 0; } - - - template -CONST_FUNCTION -inline bool needPaddingTo32Bits(const T *inbyte) { - return (reinterpret_cast(inbyte) & 3) != 0; +CONST_FUNCTION inline bool needPaddingTo32Bits(const T *inbyte) { + return (reinterpret_cast(inbyte) & 3) != 
0; } -template -CONST_FUNCTION -T *padTo32bits(T *inbyte) { - return reinterpret_cast((reinterpret_cast(inbyte) - + 3) & ~3); +template CONST_FUNCTION T *padTo32bits(T *inbyte) { + return reinterpret_cast((reinterpret_cast(inbyte) + 3) & ~3); } -template -CONST_FUNCTION -const T *padTo32bits(const T *inbyte) { - return reinterpret_cast((reinterpret_cast(inbyte) - + 3) & ~3); +template CONST_FUNCTION const T *padTo32bits(const T *inbyte) { + return reinterpret_cast((reinterpret_cast(inbyte) + 3) & + ~3); } - #ifndef _MSC_VER CONST_FUNCTION inline uint32_t asmbits(const uint32_t v) { - if (v == 0) - return 0; - uint32_t answer; - __asm__("bsr %1, %0;" :"=r"(answer) :"r"(v)); - return answer + 1; + if (v == 0) + return 0; + uint32_t answer; + __asm__("bsr %1, %0;" : "=r"(answer) : "r"(v)); + return answer + 1; } #else inline uint32_t asmbits(const uint32_t v) { - unsigned long index; - return (v == 0 || _BitScanReverse(&index, v) == 0) ? 0 : (index + 1); + unsigned long index; + return (v == 0 || _BitScanReverse(&index, v) == 0) ? 0 : (index + 1); } #endif - template -bool is_strictlysorted(iterator first, iterator last) { - iterator next = first; +bool is_strictlysorted(iterator first, iterator last) { + iterator next = first; + ++next; + while (next < last) { + if (*first >= *next) + return false; + ++first; ++next; - while (next < last) { - if (*first >= *next) - return false; - ++first; - ++next; - } - return true; + } + return true; } } // namespace SIMDCompressionLib diff --git a/include/variablebyte.h b/include/variablebyte.h index 5505864..454000d 100644 --- a/include/variablebyte.h +++ b/include/variablebyte.h @@ -1,1337 +1,1328 @@ -/** - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. 
- * - * (c) Daniel Lemire, http://lemire.me/en/ - */ - -#ifndef SIMDCompressionAndIntersection_VARIABLEBYTE_H_ -#define SIMDCompressionAndIntersection_VARIABLEBYTE_H_ -#include "common.h" -#include "codecs.h" -#include "util.h" - -namespace SIMDCompressionLib { - -/*** - * VariableByte and VByte are basically identical, except that - * one uses 0..0..0..1 to indicate 4 whereas the other one uses 1..1..1..0. - * The latter is maybe more common. - */ - - - -template -class VariableByte: public IntegerCODEC { -public: - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - uint8_t *bout = reinterpret_cast(out); - const uint8_t * const initbout = reinterpret_cast(out); - size_t bytenvalue = nvalue * sizeof(uint32_t); - encodeToByteArray(in, length, bout,bytenvalue); - bout += bytenvalue; - while (needPaddingTo32Bits(bout)) { - *bout++ = 0; - } - const size_t storageinbytes = bout - initbout; - assert((storageinbytes % 4) == 0); - nvalue = storageinbytes / 4; - } - - // write one compressed integer (without differential coding) - // returns the number of bytes written - size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { - const uint8_t * const initbout = bout; - if (val < (1U << 7)) { - *bout = static_cast(val | (1U << 7)); - ++bout; - } else if (val < (1U << 14)) { - *bout = extract7bits < 0 > (val); - ++bout; - *bout = extract7bitsmaskless < 1 > (val) | (1U << 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = extract7bits < 0 > (val); - ++bout; - *bout = extract7bits < 1 > (val); - ++bout; - *bout = extract7bitsmaskless < 2 > (val) | (1U << 7); - ++bout; - } else if (val < (1U << 28)) { - *bout = extract7bits < 0 > (val); - ++bout; - *bout = extract7bits < 1 > (val); - ++bout; - *bout = extract7bits < 2 > (val); - ++bout; - *bout = extract7bitsmaskless < 3 > (val) | (1U << 7); - ++bout; - } else { - *bout = extract7bits < 0 > (val); - ++bout; - *bout = extract7bits < 1 > (val); - ++bout; - *bout = extract7bits 
< 2 > (val); - ++bout; - *bout = extract7bits < 3 > (val); - ++bout; - *bout = extract7bitsmaskless < 4 > (val) | (1U << 7); - ++bout; - } - return bout - initbout; - } - - void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, - size_t &nvalue) { - const uint8_t * const initbout = bout; - uint32_t prev = 0; - for (size_t k = 0; k < length; ++k) { - const uint32_t val = delta ? in[k] - prev : in[k]; - if (delta) - prev = in[k]; - /** - * Code below could be shorter. Whether it could be faster - * depends on your compiler and machine. - */ - if (val < (1U << 7)) { - *bout = static_cast(val | (1U << 7)); - ++bout; - } else if (val < (1U << 14)) { - *bout = extract7bits<0>(val); - ++bout; - *bout = extract7bitsmaskless<1>(val) | (1U << 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = extract7bits<0>(val); - ++bout; - *bout = extract7bits<1>(val); - ++bout; - *bout = extract7bitsmaskless<2>(val) | (1U << 7); - ++bout; - } else if (val < (1U << 28)) { - *bout = extract7bits<0>(val); - ++bout; - *bout = extract7bits<1>(val); - ++bout; - *bout = extract7bits<2>(val); - ++bout; - *bout = extract7bitsmaskless<3>(val) | (1U << 7); - ++bout; - } else { - *bout = extract7bits<0>(val); - ++bout; - *bout = extract7bits<1>(val); - ++bout; - *bout = extract7bits<2>(val); - ++bout; - *bout = extract7bits<3>(val); - ++bout; - *bout = extract7bitsmaskless<4>(val) | (1U << 7); - ++bout; - } - } - nvalue = bout - initbout; - } - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - decodeFromByteArray((const uint8_t *)in, length * sizeof(uint32_t),out,nvalue); - return in + length; - } - - // determine how many padding bytes were used - int paddingBytes(const uint32_t *in, const size_t length) { - if(length == 0) return 0; - uint32_t lastword = in[length - 1]; - if (lastword < (1U << 8)) { - return 3; - } else if (lastword < (1U << 16)) { - return 2; - } else if (lastword < (1U << 24)) { - return 1; - } - 
return 0; - } - - - // how many bytes are required to store this integer? - int storageCost(uint32_t val) { - if (val < (1U << 7)) { - return 1; - } else if (val < (1U << 14)) { - return 2; - } else if (val < (1U << 21)) { - return 3; - } else if (val < (1U << 28)) { - return 4; - } else { - return 5; - } - } - - - const uint8_t *decodeFromByteArray(const uint8_t *inbyte, const size_t length, - uint32_t *out, size_t &nvalue) { - uint32_t prev = 0; - if (length == 0) { - nvalue = 0; - return inbyte; //abort - } - const uint8_t * const endbyte = inbyte + length; - const uint32_t * const initout(out); - // this assumes that there is a value to be read - - while(endbyte > inbyte + 5) { - if (delta) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - *out++ = (prev = v + prev); - } else { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - *out++ = v; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - *out++ = v; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - *out++ = v; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - *out++ = v; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - *out++ = v; - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << 
shift); - if ((c & 128)) { - *out++ = delta ? (prev = v + prev) : v; - break; - } - } - } - nvalue = out - initout; - return inbyte ; - } - - - // Performs a lower bound find in the encoded array. - // length is the size of the compressed input - // Returns the index - size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, - uint32_t *presult) { - uint32_t prev = 0; - if (length == 0) { - return 0; //abort - } - const uint8_t *inbyte = reinterpret_cast(in); - const uint8_t * const endbyte = reinterpret_cast(in - + length); - size_t i = 0; - // this assumes that there is a value to be read - - while (endbyte > inbyte + 5) { - if (delta) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - } else { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; 
- } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - if (v >= key) { - *presult = v; - return i; - } - i++; - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c & 128)) { - if (delta) { - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - } else { - if (v >= key) { - *presult = v; - return i; - } - } - i++; - break; - } - } - } - return i; - } - - // append a key. Keys must be in sorted order. We assume that there is - // enough room and that delta encoding was used. - // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t bytesize, uint32_t previous_key, - uint32_t key) { - assert(delta);// no performance impact expected. - uint8_t *byteininit = (uint8_t *)in; - uint8_t *bytein = (uint8_t *)in + bytesize; - bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); - return bytein - byteininit; - } - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. - size_t insert(uint32_t *in, const size_t length, uint32_t key) { - size_t bytesize = length * 4; - bytesize -= paddingBytes(in,length); - uint8_t * bytein = (uint8_t *) in; - uint8_t * byteininit = bytein; - size_t bl = insertInByteArray(bytein, bytesize, key); - bytein += bl; - - while (needPaddingTo32Bits(bytein)) { - *bytein++ = 0; - } - size_t storageinbytes = bytein - byteininit; - assert((storageinbytes % 4) == 0); - return storageinbytes / 4; - } - - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. 
- // the new size is returned - size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { - uint32_t prev = 0; - assert(delta); - const uint8_t * const endbyte = reinterpret_cast(inbyte - + length); - // this assumes that there is a value to be read - - while (endbyte > inbyte + 5) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c & 128)) { - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - break; - } - } - } - // if we make it here, then we need to append - assert(key >= prev); - return length + encodeOneIntegerToByteArray(key - prev,inbyte); - } - - - // Returns a decompressed value in an encoded array - // could be greatly optimized in the non-differential coding case: currently just for delta coding - uint32_t 
select(uint32_t *in, size_t index) { - assert(delta); - uint32_t prev = 0; - size_t i = 0; - const uint8_t *inbyte = reinterpret_cast(in); - while(i <= index) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c >= 128) { - inbyte += 1; - prev = v + prev; - i++; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c >= 128) { - inbyte += 2; - prev = v + prev; - i++; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c >= 128) { - inbyte += 3; - prev = v + prev; - i++; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c >= 128) { - inbyte += 4; - prev = v + prev; - i++; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - i++; - - } - assert(i == index + 1); - return prev; - - } - - - string name() const { - if (delta) - return "VariableByteDelta"; - else - return "VariableByte"; - } -private: - - // convenience function used by insert, writes key and newvalue to compressed stream, and return - // extra storage used, pointer should be right after where nextvalue is right now - size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, uint32_t nextvalue, size_t followingbytes) { - assert(nextvalue >= key); - assert(key >= previous); - size_t oldstorage = storageCost(nextvalue - previous); - size_t newstorage = storageCost(nextvalue - key) + storageCost(key - previous); - assert(newstorage >= oldstorage); - if(newstorage > oldstorage) std::memmove(in + newstorage - oldstorage, in,followingbytes); - uint8_t *newin = in - oldstorage; - newin += encodeOneIntegerToByteArray(key - previous, newin); - newin += encodeOneIntegerToByteArray(nextvalue - key, newin); - assert(newin == in + newstorage - oldstorage); - return newstorage - oldstorage; - } - - template - uint8_t extract7bits(const uint32_t val) { - return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); - } - - template - uint8_t extract7bitsmaskless(const uint32_t val) { - return static_cast((val >> (7 * 
i))); - } -}; - -template -class VByte: public IntegerCODEC { -public: - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - uint8_t *bout = reinterpret_cast(out); - const uint8_t * const initbout = reinterpret_cast(out); - size_t bytenvalue = nvalue * sizeof(uint32_t); - encodeToByteArray(in, length, bout,bytenvalue); - bout += bytenvalue; - while (needPaddingTo32Bits(bout)) { - *bout++ = 0xFF; - } - const size_t storageinbytes = bout - initbout; - assert((storageinbytes % 4) == 0); - nvalue = storageinbytes / 4; - } - - - void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, - size_t &nvalue) { - uint32_t prev = 0; - const uint8_t * const initbout = bout; - for (size_t k = 0; k < length; ++k) { - const uint32_t val = delta ? (in[k] - prev) : in[k]; - if (delta) - prev = in[k]; - /** - * Code below could be shorter. Whether it could be faster - * depends on your compiler and machine. - */ - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 21); - ++bout; - } else { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 28); - ++bout; - } - } - nvalue = bout - initbout; - - } - - // write one 
compressed integer (without differential coding) - // returns the number of bytes written - size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { - const uint8_t * const initbout = bout; - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 21); - ++bout; - } else { - *bout = static_cast((val & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = static_cast(val >> 28); - ++bout; - } - return bout - initbout; - } - - - // determine how many padding bytes were used - int paddingBytes(const uint32_t *in, const size_t length) { - if(length == 0) return 0; - uint32_t lastword = in[length - 1]; - lastword = ~lastword; - if (lastword < (1U << 8)) { - return 3; - } else if (lastword < (1U << 16)) { - return 2; - } else if (lastword < (1U << 24)) { - return 1; - } - return 0; - } - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t &nvalue) { - decodeFromByteArray((const uint8_t *)in, length * sizeof(uint32_t),out,nvalue); - return in + length; - } - - - // how many bytes are required to store this integer? 
- int storageCost(uint32_t val) { - if (val < (1U << 7)) { - return 1; - } else if (val < (1U << 14)) { - return 2; - } else if (val < (1U << 21)) { - return 3; - } else if (val < (1U << 28)) { - return 4; - } else { - return 5; - } - } - - - const uint8_t *decodeFromByteArray(const uint8_t *inbyte, const size_t length, - uint32_t *out, size_t &nvalue) { - uint32_t prev = 0; - if (length == 0) { - nvalue = 0; - return inbyte; //abort - } - const uint8_t * const endbyte = inbyte + length; - const uint32_t * const initout(out); - // this assumes that there is a value to be read - - while(endbyte > inbyte + 5) { - if (delta) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - *out++ = (prev = v + prev); - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - *out++ = (prev = v + prev); - } else { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - *out++ = v; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - *out++ = v; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - *out++ = v; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - *out++ = v; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - *out++ = v; - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c < 128)) { - *out++ = delta ? 
(prev = v + prev) : v; - break; - } - } - } - nvalue = out - initout; - return inbyte; - } - - - // Performs a lower bound find in the encoded array. - // length is the size of the compressed input - // Returns the index and the value found (presult) - size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, - uint32_t *presult) { - uint32_t prev = 0; - if (length == 0) { - return 0; //abort - } - size_t i = 0; - const uint8_t *inbyte = reinterpret_cast(in); - const uint8_t * const endbyte = reinterpret_cast(in - + length); - // this assumes that there is a value to be read - - while (endbyte > inbyte + 5) { - if (delta) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - i++; - } else { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[3]; 
- v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - if (v >= key) { - *presult = v; - return i; - } - i++; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - if (v >= key) { - *presult = v; - return i; - } - i++; - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for (uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c < 128)) { - if (delta) { - prev = v + prev; - if (prev >= key) { - *presult = prev; - return i; - } - } else { - if (v >= key) { - *presult = v; - return i; - } - } - i++; - break; - } - } - } - return i; - } - - // append a key. Keys must be in sorted order. We assume that there is - // enough room and that delta encoding was used. - /*size_t append(uint32_t *in, const size_t length, uint32_t previous_key, - uint32_t key) { - size_t bytesize = (length * 4) - paddingBytes(in, length); - uint8_t *byteininit = (uint8_t *)in; - uint8_t *bytein = (uint8_t *)in + bytesize; - bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); - while (needPaddingTo32Bits(bytein)) { - *bytein++ = 0xFF; - } - size_t storageinbytes = bytein - byteininit; - assert((storageinbytes % 4) == 0); - return storageinbytes / 4; - }*/ - - - // append a key. Keys must be in sorted order. We assume that there is - // enough room and that delta encoding was used. - // Returns the new size of the compressed array *in bytes* - size_t appendToByteArray(uint8_t *in, const size_t bytesize, uint32_t previous_key, - uint32_t key) { - assert(delta);// no performance impact expected. - uint8_t *byteininit = (uint8_t *)in; - uint8_t *bytein = (uint8_t *)in + bytesize; - bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); - return bytein - byteininit; - } - - // insert the key in sorted order. We assume that there is enough room - // and that delta encoding was used. 
- size_t insert(uint32_t *in, const size_t length, uint32_t key) { - assert(delta); - size_t bytesize = length * 4; - bytesize -= paddingBytes(in,length); - uint8_t * bytein = (uint8_t *) in; - uint8_t * byteininit = bytein; - bytein += insertInByteArray(bytein, bytesize, key); - - while (needPaddingTo32Bits(bytein)) { - *bytein++ = 0xFF; - } - size_t storageinbytes = bytein - byteininit; - assert((storageinbytes % 4) == 0); - return storageinbytes / 4; - } - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. - // the new size (in *byte) is returned - size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { - uint32_t prev = 0; - assert(delta); - const uint8_t * const endbyte = reinterpret_cast(inbyte - + length); - // this assumes that there is a value to be read - - while (endbyte > inbyte + 5) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - if (prev >= key) { - return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); - } - } - while (endbyte > inbyte) { - unsigned int shift = 0; - for 
(uint32_t v = 0; endbyte > inbyte; shift += 7) { - uint8_t c = *inbyte++; - v += ((c & 127) << shift); - if ((c < 128)) { - prev = v + prev; - if (prev >= key) { - return length - + __insert(inbyte, prev - v, key, prev, - endbyte - inbyte); - - } - break; - } - } - } - // if we make it here, then we need to append - assert(key >= prev); - return length + encodeOneIntegerToByteArray(key - prev, inbyte); - } - - - - // Returns a decompressed value in an encoded array - // could be greatly optimized in the non-differential coding case: currently just for delta coding - uint32_t select(uint32_t *in, size_t index) { - assert(delta); - uint32_t prev = 0; - size_t i = 0; - const uint8_t *inbyte = reinterpret_cast(in); - - while(i <= index) { - uint8_t c; - uint32_t v; - - c = inbyte[0]; - v = c & 0x7F; - if (c < 128) { - inbyte += 1; - prev = v + prev; - ++i; - continue; - } - - c = inbyte[1]; - v |= (c & 0x7F) << 7; - if (c < 128) { - inbyte += 2; - prev = v + prev; - ++i; - continue; - } - - c = inbyte[2]; - v |= (c & 0x7F) << 14; - if (c < 128) { - inbyte += 3; - prev = v + prev; - ++i; - continue; - } - - c = inbyte[3]; - v |= (c & 0x7F) << 21; - if (c < 128) { - inbyte += 4; - prev = v + prev; - ++i; - continue; - } - - c = inbyte[4]; - inbyte += 5; - v |= (c & 0x0F) << 28; - prev = v + prev; - ++i; - } - assert(i == index + 1); - return prev; - - } - - std::string name() const { - if (delta) - return "VByteDelta"; - else - return "VByte"; - } - -private: - - // convenience function used by insert, writes key and newvalue to compressed stream, and return - // extra storage used, pointer should be right after where nextvalue is right now - size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, uint32_t nextvalue, size_t followingbytes) { - assert(nextvalue >= key); - assert(key >= previous); - size_t oldstorage = storageCost(nextvalue - previous); - size_t newstorage = storageCost(nextvalue - key) + storageCost(key - previous); - assert(newstorage >= 
oldstorage); - if(newstorage > oldstorage) std::memmove(in + newstorage - oldstorage, in,followingbytes); - uint8_t *newin = in - oldstorage; - newin += encodeOneIntegerToByteArray(key - previous, newin); - newin += encodeOneIntegerToByteArray(nextvalue - key, newin); - assert(newin == in + newstorage - oldstorage); - return newstorage - oldstorage; - } - - - template - uint8_t extract7bits(const uint32_t val) { - return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); - } - - template - uint8_t extract7bitsmaskless(const uint32_t val) { - return static_cast((val >> (7 * i))); - } - -}; - - -} // namespace SIMDCompressionLib - -#endif /* SIMDCompressionAndIntersection_VARIABLEBYTE_H_ */ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef SIMDCompressionAndIntersection_VARIABLEBYTE_H_ +#define SIMDCompressionAndIntersection_VARIABLEBYTE_H_ +#include "common.h" +#include "codecs.h" +#include "util.h" + +namespace SIMDCompressionLib { + +/*** + * VariableByte and VByte are basically identical, except that + * one uses 0..0..0..1 to indicate 4 whereas the other one uses 1..1..1..0. + * The latter is maybe more common. 
+ */ + +template class VariableByte : public IntegerCODEC { +public: + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint8_t *bout = reinterpret_cast(out); + const uint8_t *const initbout = reinterpret_cast(out); + size_t bytenvalue = nvalue * sizeof(uint32_t); + encodeToByteArray(in, length, bout, bytenvalue); + bout += bytenvalue; + while (needPaddingTo32Bits(bout)) { + *bout++ = 0; + } + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes / 4; + } + + // write one compressed integer (without differential coding) + // returns the number of bytes written + size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { + const uint8_t *const initbout = bout; + if (val < (1U << 7)) { + *bout = static_cast(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bitsmaskless<1>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bitsmaskless<2>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bitsmaskless<3>(val) | (1U << 7); + ++bout; + } else { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bits<3>(val); + ++bout; + *bout = extract7bitsmaskless<4>(val) | (1U << 7); + ++bout; + } + return bout - initbout; + } + + void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, + size_t &nvalue) { + const uint8_t *const initbout = bout; + uint32_t prev = 0; + for (size_t k = 0; k < length; ++k) { + const uint32_t val = delta ? in[k] - prev : in[k]; + if (delta) + prev = in[k]; + /** + * Code below could be shorter. 
Whether it could be faster + * depends on your compiler and machine. + */ + if (val < (1U << 7)) { + *bout = static_cast(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bitsmaskless<1>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bitsmaskless<2>(val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bitsmaskless<3>(val) | (1U << 7); + ++bout; + } else { + *bout = extract7bits<0>(val); + ++bout; + *bout = extract7bits<1>(val); + ++bout; + *bout = extract7bits<2>(val); + ++bout; + *bout = extract7bits<3>(val); + ++bout; + *bout = extract7bitsmaskless<4>(val) | (1U << 7); + ++bout; + } + } + nvalue = bout - initbout; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + decodeFromByteArray((const uint8_t *)in, length * sizeof(uint32_t), out, + nvalue); + return in + length; + } + + // determine how many padding bytes were used + int paddingBytes(const uint32_t *in, const size_t length) { + if (length == 0) + return 0; + uint32_t lastword = in[length - 1]; + if (lastword < (1U << 8)) { + return 3; + } else if (lastword < (1U << 16)) { + return 2; + } else if (lastword < (1U << 24)) { + return 1; + } + return 0; + } + + // how many bytes are required to store this integer? 
+ int storageCost(uint32_t val) { + if (val < (1U << 7)) { + return 1; + } else if (val < (1U << 14)) { + return 2; + } else if (val < (1U << 21)) { + return 3; + } else if (val < (1U << 28)) { + return 4; + } else { + return 5; + } + } + + const uint8_t *decodeFromByteArray(const uint8_t *inbyte, const size_t length, + uint32_t *out, size_t &nvalue) { + uint32_t prev = 0; + if (length == 0) { + nvalue = 0; + return inbyte; // abort + } + const uint8_t *const endbyte = inbyte + length; + const uint32_t *const initout(out); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + if (delta) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c >= 128) { + inbyte += 4; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + *out++ = (prev = v + prev); + } else { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + *out++ = v; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + *out++ = v; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + *out++ = v; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c >= 128) { + inbyte += 4; + *out++ = v; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + *out++ = v; + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + *out++ = delta ? 
(prev = v + prev) : v; + break; + } + } + } + nvalue = out - initout; + return inbyte; + } + + // Performs a lower bound find in the encoded array. + // length is the size of the compressed input + // Returns the index + size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, + uint32_t *presult) { + uint32_t prev = 0; + if (length == 0) { + return 0; // abort + } + const uint8_t *inbyte = reinterpret_cast(in); + const uint8_t *const endbyte = + reinterpret_cast(in + length); + size_t i = 0; + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + if (delta) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c >= 128) { + inbyte += 4; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + } else { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; 
+ if (c >= 128) { + inbyte += 4; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + if (v >= key) { + *presult = v; + return i; + } + i++; + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + if (delta) { + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + } else { + if (v >= key) { + *presult = v; + return i; + } + } + i++; + break; + } + } + } + return i; + } + + // append a key. Keys must be in sorted order. We assume that there is + // enough room and that delta encoding was used. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t bytesize, + uint32_t previous_key, uint32_t key) { + assert(delta); // no performance impact expected. + uint8_t *byteininit = (uint8_t *)in; + uint8_t *bytein = (uint8_t *)in + bytesize; + bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); + return bytein - byteininit; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. + size_t insert(uint32_t *in, const size_t length, uint32_t key) { + size_t bytesize = length * 4; + bytesize -= paddingBytes(in, length); + uint8_t *bytein = (uint8_t *)in; + uint8_t *byteininit = bytein; + size_t bl = insertInByteArray(bytein, bytesize, key); + bytein += bl; + + while (needPaddingTo32Bits(bytein)) { + *bytein++ = 0; + } + size_t storageinbytes = bytein - byteininit; + assert((storageinbytes % 4) == 0); + return storageinbytes / 4; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. 
+ // the new size is returned + size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { + uint32_t prev = 0; + assert(delta); + const uint8_t *const endbyte = + reinterpret_cast(inbyte + length); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c >= 128) { + inbyte += 4; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + if (prev >= key) { + return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + break; + } + } + } + // if we make it here, then we need to append + assert(key >= prev); + return length + encodeOneIntegerToByteArray(key - prev, inbyte); + } + + // Returns a decompressed value in an encoded array + // could be greatly optimized in the non-differential coding case: currently + // just for delta coding + uint32_t 
select(uint32_t *in, size_t index) { + assert(delta); + uint32_t prev = 0; + size_t i = 0; + const uint8_t *inbyte = reinterpret_cast(in); + while (i <= index) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c >= 128) { + inbyte += 1; + prev = v + prev; + i++; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c >= 128) { + inbyte += 2; + prev = v + prev; + i++; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c >= 128) { + inbyte += 3; + prev = v + prev; + i++; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c >= 128) { + inbyte += 4; + prev = v + prev; + i++; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + i++; + } + assert(i == index + 1); + return prev; + } + + string name() const { + if (delta) + return "VariableByteDelta"; + else + return "VariableByte"; + } + +private: + // convenience function used by insert, writes key and newvalue to compressed + // stream, and return + // extra storage used, pointer should be right after where nextvalue is right + // now + size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, + uint32_t nextvalue, size_t followingbytes) { + assert(nextvalue >= key); + assert(key >= previous); + size_t oldstorage = storageCost(nextvalue - previous); + size_t newstorage = + storageCost(nextvalue - key) + storageCost(key - previous); + assert(newstorage >= oldstorage); + if (newstorage > oldstorage) + std::memmove(in + newstorage - oldstorage, in, followingbytes); + uint8_t *newin = in - oldstorage; + newin += encodeOneIntegerToByteArray(key - previous, newin); + newin += encodeOneIntegerToByteArray(nextvalue - key, newin); + assert(newin == in + newstorage - oldstorage); + return newstorage - oldstorage; + } + + template uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } + + template uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> 
(7 * i))); + } +}; + +template class VByte : public IntegerCODEC { +public: + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint8_t *bout = reinterpret_cast(out); + const uint8_t *const initbout = reinterpret_cast(out); + size_t bytenvalue = nvalue * sizeof(uint32_t); + encodeToByteArray(in, length, bout, bytenvalue); + bout += bytenvalue; + while (needPaddingTo32Bits(bout)) { + *bout++ = 0xFF; + } + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes / 4; + } + + void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, + size_t &nvalue) { + uint32_t prev = 0; + const uint8_t *const initbout = bout; + for (size_t k = 0; k < length; ++k) { + const uint32_t val = delta ? (in[k] - prev) : in[k]; + if (delta) + prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. + */ + if (val < (1U << 7)) { + *bout = val & 0x7F; + ++bout; + } else if (val < (1U << 14)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 14); + ++bout; + } else if (val < (1U << 28)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 21); + ++bout; + } else { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 28); + ++bout; + } + } + nvalue = bout - initbout; + } + + // write one 
compressed integer (without differential coding) + // returns the number of bytes written + size_t encodeOneIntegerToByteArray(uint32_t val, uint8_t *bout) { + const uint8_t *const initbout = bout; + if (val < (1U << 7)) { + *bout = val & 0x7F; + ++bout; + } else if (val < (1U << 14)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 14); + ++bout; + } else if (val < (1U << 28)) { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 21); + ++bout; + } else { + *bout = static_cast((val & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 7) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 14) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(((val >> 21) & 0x7F) | (1U << 7)); + ++bout; + *bout = static_cast(val >> 28); + ++bout; + } + return bout - initbout; + } + + // determine how many padding bytes were used + int paddingBytes(const uint32_t *in, const size_t length) { + if (length == 0) + return 0; + uint32_t lastword = in[length - 1]; + lastword = ~lastword; + if (lastword < (1U << 8)) { + return 3; + } else if (lastword < (1U << 16)) { + return 2; + } else if (lastword < (1U << 24)) { + return 1; + } + return 0; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + decodeFromByteArray((const uint8_t *)in, length * sizeof(uint32_t), out, + nvalue); + return in + length; + } + + // how many bytes are required to store this integer? 
+ int storageCost(uint32_t val) { + if (val < (1U << 7)) { + return 1; + } else if (val < (1U << 14)) { + return 2; + } else if (val < (1U << 21)) { + return 3; + } else if (val < (1U << 28)) { + return 4; + } else { + return 5; + } + } + + const uint8_t *decodeFromByteArray(const uint8_t *inbyte, const size_t length, + uint32_t *out, size_t &nvalue) { + uint32_t prev = 0; + if (length == 0) { + nvalue = 0; + return inbyte; // abort + } + const uint8_t *const endbyte = inbyte + length; + const uint32_t *const initout(out); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + if (delta) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + *out++ = (prev = v + prev); + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + *out++ = (prev = v + prev); + } else { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + *out++ = v; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + *out++ = v; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + *out++ = v; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + *out++ = v; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + *out++ = v; + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c < 128)) { + *out++ = delta ? 
(prev = v + prev) : v; + break; + } + } + } + nvalue = out - initout; + return inbyte; + } + + // Performs a lower bound find in the encoded array. + // length is the size of the compressed input + // Returns the index and the value found (presult) + size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, + uint32_t *presult) { + uint32_t prev = 0; + if (length == 0) { + return 0; // abort + } + size_t i = 0; + const uint8_t *inbyte = reinterpret_cast(in); + const uint8_t *const endbyte = + reinterpret_cast(in + length); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + if (delta) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + i++; + } else { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[3]; + 
v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + if (v >= key) { + *presult = v; + return i; + } + i++; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + if (v >= key) { + *presult = v; + return i; + } + i++; + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c < 128)) { + if (delta) { + prev = v + prev; + if (prev >= key) { + *presult = prev; + return i; + } + } else { + if (v >= key) { + *presult = v; + return i; + } + } + i++; + break; + } + } + } + return i; + } + + // append a key. Keys must be in sorted order. We assume that there is + // enough room and that delta encoding was used. + /*size_t append(uint32_t *in, const size_t length, uint32_t previous_key, + uint32_t key) { + size_t bytesize = (length * 4) - paddingBytes(in, length); + uint8_t *byteininit = (uint8_t *)in; + uint8_t *bytein = (uint8_t *)in + bytesize; + bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); + while (needPaddingTo32Bits(bytein)) { + *bytein++ = 0xFF; + } + size_t storageinbytes = bytein - byteininit; + assert((storageinbytes % 4) == 0); + return storageinbytes / 4; + }*/ + + // append a key. Keys must be in sorted order. We assume that there is + // enough room and that delta encoding was used. + // Returns the new size of the compressed array *in bytes* + size_t appendToByteArray(uint8_t *in, const size_t bytesize, + uint32_t previous_key, uint32_t key) { + assert(delta); // no performance impact expected. + uint8_t *byteininit = (uint8_t *)in; + uint8_t *bytein = (uint8_t *)in + bytesize; + bytein += encodeOneIntegerToByteArray(key - previous_key, bytein); + return bytein - byteininit; + } + + // insert the key in sorted order. We assume that there is enough room + // and that delta encoding was used. 
+ size_t insert(uint32_t *in, const size_t length, uint32_t key) { + assert(delta); + size_t bytesize = length * 4; + bytesize -= paddingBytes(in, length); + uint8_t *bytein = (uint8_t *)in; + uint8_t *byteininit = bytein; + bytein += insertInByteArray(bytein, bytesize, key); + + while (needPaddingTo32Bits(bytein)) { + *bytein++ = 0xFF; + } + size_t storageinbytes = bytein - byteininit; + assert((storageinbytes % 4) == 0); + return storageinbytes / 4; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. + // the new size (in *byte) is returned + size_t insertInByteArray(uint8_t *inbyte, const size_t length, uint32_t key) { + uint32_t prev = 0; + assert(delta); + const uint8_t *const endbyte = + reinterpret_cast(inbyte + length); + // this assumes that there is a value to be read + + while (endbyte > inbyte + 5) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + if (prev >= key) { + return length + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + } + while (endbyte > inbyte) { + unsigned int shift = 0; + 
for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c < 128)) { + prev = v + prev; + if (prev >= key) { + return length + + __insert(inbyte, prev - v, key, prev, endbyte - inbyte); + } + break; + } + } + } + // if we make it here, then we need to append + assert(key >= prev); + return length + encodeOneIntegerToByteArray(key - prev, inbyte); + } + + // Returns a decompressed value in an encoded array + // could be greatly optimized in the non-differential coding case: currently + // just for delta coding + uint32_t select(uint32_t *in, size_t index) { + assert(delta); + uint32_t prev = 0; + size_t i = 0; + const uint8_t *inbyte = reinterpret_cast(in); + + while (i <= index) { + uint8_t c; + uint32_t v; + + c = inbyte[0]; + v = c & 0x7F; + if (c < 128) { + inbyte += 1; + prev = v + prev; + ++i; + continue; + } + + c = inbyte[1]; + v |= (c & 0x7F) << 7; + if (c < 128) { + inbyte += 2; + prev = v + prev; + ++i; + continue; + } + + c = inbyte[2]; + v |= (c & 0x7F) << 14; + if (c < 128) { + inbyte += 3; + prev = v + prev; + ++i; + continue; + } + + c = inbyte[3]; + v |= (c & 0x7F) << 21; + if (c < 128) { + inbyte += 4; + prev = v + prev; + ++i; + continue; + } + + c = inbyte[4]; + inbyte += 5; + v |= (c & 0x0F) << 28; + prev = v + prev; + ++i; + } + assert(i == index + 1); + return prev; + } + + std::string name() const { + if (delta) + return "VByteDelta"; + else + return "VByte"; + } + +private: + // convenience function used by insert, writes key and newvalue to compressed + // stream, and return + // extra storage used, pointer should be right after where nextvalue is right + // now + size_t __insert(uint8_t *in, uint32_t previous, uint32_t key, + uint32_t nextvalue, size_t followingbytes) { + assert(nextvalue >= key); + assert(key >= previous); + size_t oldstorage = storageCost(nextvalue - previous); + size_t newstorage = + storageCost(nextvalue - key) + storageCost(key - previous); + assert(newstorage 
>= oldstorage); + if (newstorage > oldstorage) + std::memmove(in + newstorage - oldstorage, in, followingbytes); + uint8_t *newin = in - oldstorage; + newin += encodeOneIntegerToByteArray(key - previous, newin); + newin += encodeOneIntegerToByteArray(nextvalue - key, newin); + assert(newin == in + newstorage - oldstorage); + return newstorage - oldstorage; + } + + template uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } + + template uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> (7 * i))); + } +}; + +} // namespace SIMDCompressionLib + +#endif /* SIMDCompressionAndIntersection_VARIABLEBYTE_H_ */ diff --git a/include/varintgb.h b/include/varintgb.h index 166fcb8..769b75b 100644 --- a/include/varintgb.h +++ b/include/varintgb.h @@ -16,26 +16,21 @@ namespace SIMDCompressionLib { using namespace std; - - static uint8_t group_size[] = { - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, - 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16 - }; + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6, 7, + 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 
8, 9, 7, 8, + 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, + 10, 11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, + 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, + 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, + 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, + 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, + 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, + 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, + 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, + 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, + 14, 12, 13, 14, 15, 13, 14, 15, 16}; /** * Group VarInt. @@ -44,888 +39,891 @@ static uint8_t group_size[] = { * optimized by N. Kurz. */ -template -class VarIntGB: public IntegerCODEC { +template class VarIntGB : public IntegerCODEC { public: - - void encodeArray(uint32_t *in, const size_t length, uint32_t *out, - size_t &nvalue) { - uint8_t * bout = reinterpret_cast (out); - const uint8_t * const initbout = reinterpret_cast (out); - *out = static_cast(length); - bout += 4; - bout = headlessEncode(in,length,0,bout); - - while (needPaddingTo32Bits(bout)) { - *bout++ = 0; - } - const size_t storageinbytes = bout - initbout; - assert((storageinbytes % 4) == 0); - nvalue = storageinbytes / 4; + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint8_t *bout = reinterpret_cast(out); + const uint8_t *const initbout = reinterpret_cast(out); + *out = static_cast(length); + bout += 4; + bout = headlessEncode(in, length, 0, bout); + + while (needPaddingTo32Bits(bout)) { + *bout++ = 0; } - - void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, - size_t &nvalue) { - const uint8_t *const initbout = bout; - *(uint32_t *)bout = 
static_cast(length); - bout += 4; - bout = headlessEncode(in, length, 0, bout); - nvalue = bout - initbout; + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes / 4; + } + + void encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, + size_t &nvalue) { + const uint8_t *const initbout = bout; + *(uint32_t *)bout = static_cast(length); + bout += 4; + bout = headlessEncode(in, length, 0, bout); + nvalue = bout - initbout; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + if (length == 0) { + nvalue = 0; + return in; } - - const uint32_t *decodeArray(const uint32_t *in, const size_t length, - uint32_t *out, size_t & nvalue) { - if(length == 0) { - nvalue = 0; - return in; - } - const uint8_t * inbyte = reinterpret_cast (in); - nvalue = *in; - inbyte += 4; - size_t decoded = headlessDecode(inbyte, (length - 1 )* sizeof(uint32_t),0,out,nvalue); - assert(decoded == nvalue); - return in + length; + const uint8_t *inbyte = reinterpret_cast(in); + nvalue = *in; + inbyte += 4; + size_t decoded = + headlessDecode(inbyte, (length - 1) * sizeof(uint32_t), 0, out, nvalue); + assert(decoded == nvalue); + return in + length; + } + + const uint8_t *decodeFromByteArray(const uint8_t *inbyte, const size_t length, + uint32_t *out, size_t &nvalue) { + if (length == 0) { + nvalue = 0; + return inbyte; } - - const uint8_t *decodeFromByteArray(const uint8_t *inbyte, - const size_t length, uint32_t *out, - size_t & nvalue) { - if (length == 0) { - nvalue = 0; - return inbyte; - } - nvalue = *(uint32_t *)inbyte; - inbyte += 4; - size_t decoded = headlessDecode(inbyte, length - 4, 0, out, nvalue); - assert(decoded == nvalue); - return inbyte + length; + nvalue = *(uint32_t *)inbyte; + inbyte += 4; + size_t decoded = headlessDecode(inbyte, length - 4, 0, out, nvalue); + assert(decoded == nvalue); + return inbyte + length; + } + + // appends a key + // return 
the new size in bytes + size_t appendToByteArray(uint8_t *in, const size_t length, uint32_t previous, + uint32_t value) { + uint32_t num_ints = *(uint32_t *)in; + uint8_t *bout = reinterpret_cast(in + 4); + uint8_t *bend = in + (length == 0 ? 4 : length); + + if (delta) + value -= previous; + + uint8_t *keyp; + int shift; + + // fast-forward to the last block + if (num_ints % 4 != 0) { + uint32_t size = 0; + do { + bout += size; + size = 1 + group_size[*bout]; + } while (bout + size < bend); + keyp = bout; + bout = bend; + shift = (num_ints % 4) * 2; + } else { + keyp = bend; + bout = keyp + 1; + *keyp = 0; + shift = 0; } - // appends a key - // return the new size in bytes - size_t appendToByteArray(uint8_t *in, const size_t length, uint32_t previous, - uint32_t value) { - uint32_t num_ints = *(uint32_t *)in; - uint8_t *bout = reinterpret_cast (in + 4); - uint8_t *bend = in + (length == 0 ? 4 : length); + if (value < (1U << 8)) { + *bout++ = static_cast(value); + } else if (value < (1U << 16)) { + *bout++ = static_cast(value); + *bout++ = static_cast(value >> 8); + *keyp |= (1 << shift); + } else if (value < (1U << 24)) { + *bout++ = static_cast(value); + *bout++ = static_cast(value >> 8); + *bout++ = static_cast(value >> 16); + *keyp |= (2 << shift); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = value; + bout += 4; + *keyp |= (3 << shift); + } - if (delta) - value -= previous; - - uint8_t *keyp; - int shift; - - // fast-forward to the last block - if (num_ints % 4 != 0) { - uint32_t size = 0; - do { - bout += size; - size = 1 + group_size[*bout]; - } while (bout + size < bend); - keyp = bout; - bout = bend; - shift = (num_ints % 4) * 2; + *(uint32_t *)in = num_ints + 1; + return bout - in; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. 
+ size_t insert(uint32_t *in, const size_t length, uint32_t key) { + size_t bytesize = length * 4; + uint8_t *bytein = (uint8_t *)in; + uint8_t *byteininit = bytein; + size_t bl = insertInByteArray(bytein, bytesize, key); + bytein += bl; + + while (needPaddingTo32Bits(bytein)) { + *bytein++ = 0; + } + size_t storageinbytes = bytein - byteininit; + assert((storageinbytes % 4) == 0); + return storageinbytes / 4; + } + + // insert the key in sorted order. We assume that there is enough room and + // that delta encoding was used. + // the new size is returned (in bytes) + size_t insertInByteArray(uint8_t *inbyte, size_t length, uint32_t key) { + if (length == 0) { + *((uint32_t *)inbyte) = 0; + length = 4; + } + uint8_t *finalinbyte = inbyte + length; + const uint8_t *const initinbyte = inbyte; + uint32_t nvalue = *((uint32_t *)inbyte); + *((uint32_t *)inbyte) = nvalue + 1; // incrementing + inbyte += 4; // skip nvalue + assert(delta); + uint32_t initial = 0; + size_t i = 0; + while (i + 3 < nvalue) { + uint32_t copyinitial = initial; + const uint8_t *const newinbyte = + scanGroupVarIntDelta(inbyte, ©initial); + if (copyinitial >= key) { + goto finish; + } + inbyte = (uint8_t *)newinbyte; + initial = copyinitial; + i += 4; + } + finish: + assert(finalinbyte >= inbyte); + assert(i <= nvalue); + static const int REASONABLEBUFFER = 256; + if (nvalue - i + 1 > REASONABLEBUFFER) { + if (nvalue == i) { // straight append + const uint8_t *const newfinalinbyte = + headlessEncode(&key, 1, initial, inbyte); + return newfinalinbyte - initinbyte; + } + if (nvalue - i <= 4) { + // easy case + uint32_t tmpbuffer[5]; + tmpbuffer[0] = key; + size_t decoded = headlessDecode(inbyte, finalinbyte - inbyte, initial, + tmpbuffer + 1, nvalue - i); + assert(decoded == nvalue - i); + sortinfirstvalue(tmpbuffer, nvalue - i); + const uint8_t *const newfinalinbyte = + headlessEncode(tmpbuffer, nvalue - i + 1, initial, inbyte); + return newfinalinbyte - initinbyte; + } + // harder case + // this 
part is a bit complicated since we need to merge in the key + uint32_t readinitial = initial; + uint32_t tmpbuffer[5]; + tmpbuffer[0] = key; + const uint8_t *readinginbyte = + decodeGroupVarIntDelta(inbyte, &readinitial, tmpbuffer + 1); + assert(tmpbuffer[4] >= key); + assert(readinginbyte > inbyte); + + sortinfirstvalue(tmpbuffer, nvalue - i); + i += 4; + + // initialize blocks + + Block b1, b2; + + Block *block1 = &b1; + Block *block2 = &b2; + Block *blocktmp; + + // load block1 + + uint8_t *fb = encodeGroupVarIntDelta(block1->data, initial, tmpbuffer); + + block1->length = static_cast(fb - block1->data); + uint32_t nextval = tmpbuffer[4] - tmpbuffer[3]; + uint32_t newsel = getByteLength(nextval) - 1; + // everything after that is just going to be shifting + while (nvalue - i >= 4) { + + // load block 2 + assert(readinginbyte >= inbyte); + readinginbyte = loadblock(block2, readinginbyte); + i += 4; + // shift in block 1 + shiftin(block2, &nextval, &newsel); + // write block1 + memcpy(inbyte, block1->data, block1->length); + inbyte += block1->length; + // block1 = block2 + blocktmp = block1; + block1 = block2; + block2 = blocktmp; + } + if (nvalue != i) { + readinginbyte = loadblockcarefully(block2, readinginbyte, nvalue - i); + finalshiftin(block2, nextval, newsel, + nvalue - i + 1); // nextval is useless here + memcpy(inbyte, block1->data, block1->length); + inbyte += block1->length; + memcpy(inbyte, block2->data, block2->length); + inbyte += block2->length; + return inbyte - initinbyte; + } else { + memcpy(inbyte, block1->data, block1->length); + inbyte += block1->length; + inbyte[0] = newsel; + inbyte++; + memcpy(inbyte, &nextval, newsel + 1); + inbyte += newsel + 1; + return inbyte - initinbyte; + } + // we are using brute force here, by decoding everything to a buffer and + // then reencoding. 
+ } else { + uint32_t tmpbuffer[REASONABLEBUFFER]; + assert(tmpbuffer); + tmpbuffer[0] = key; + if (nvalue != i) { + size_t decoded = headlessDecode(inbyte, finalinbyte - inbyte, initial, + tmpbuffer + 1, nvalue - i); + assert(decoded == nvalue - i); + sortinfirstvalue(tmpbuffer, nvalue - i); + } + const uint8_t *const newfinalinbyte = + headlessEncode(tmpbuffer, nvalue - i + 1, initial, inbyte); + return newfinalinbyte - initinbyte; + } + } + + // Performs a lower bound find in the encoded array. + // Returns the index + // assumes delta coding was used + size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, + uint32_t *presult) { + const uint8_t *inbyte = reinterpret_cast(in); + uint32_t out[4] = {0}; + assert(delta); + size_t i = 0; + uint32_t initial = 0; + uint32_t nvalue = *in; + + inbyte += 4; // skip nvalue + + const uint8_t *const endbyte = + reinterpret_cast(in + length); + while (i + 3 < nvalue) { + uint32_t gap1, gap2, gap3, gap4; + uint32_t gap12, gap34; + + const uint32_t sel = *inbyte++; + if (sel == 0) { + gap1 = static_cast(inbyte[0]); + gap2 = static_cast(inbyte[1]); + gap12 = gap1 + gap2; + gap3 = static_cast(inbyte[2]); + gap4 = static_cast(inbyte[3]); + gap34 = gap3 + gap4; + inbyte += 4; + } else { + const uint32_t sel1 = (sel & 3); + gap1 = *(reinterpret_cast(inbyte)) & mask[sel1]; + inbyte += sel1 + 1; + const uint32_t sel2 = ((sel >> 2) & 3); + gap2 = *(reinterpret_cast(inbyte)) & mask[sel2]; + gap12 = gap1 + gap2; + inbyte += sel2 + 1; + const uint32_t sel3 = ((sel >> 4) & 3); + gap3 = *(reinterpret_cast(inbyte)) & mask[sel3]; + inbyte += sel3 + 1; + const uint32_t sel4 = (sel >> 6); + gap4 = *(reinterpret_cast(inbyte)) & mask[sel4]; + gap34 = gap3 + gap4; + inbyte += sel4 + 1; + } + initial += gap12 + gap34; + if (key <= initial) { + if (key <= initial - gap34 - gap2) { + *presult = initial - gap34 - gap2; + return (i + 0); } - else { - keyp = bend; - bout = keyp + 1; - *keyp = 0; - shift = 0; + if (key <= initial - 
gap34) { + *presult = initial - gap34; + return (i + 1); } - - if (value < (1U << 8)) { - *bout++ = static_cast (value); + if (key <= initial - gap4) { + *presult = initial - gap4; + return (i + 2); } - else if (value < (1U << 16)) { - *bout++ = static_cast (value); - *bout++ = static_cast (value >> 8); - *keyp |= (1 << shift); + *presult = initial; + return (i + 3); + } + i += 4; + } + if (endbyte > inbyte && nvalue > i) { + uint32_t tnvalue = static_cast(nvalue - 1 - i); + inbyte = decodeCarefully(inbyte, &initial, out, tnvalue); + assert(inbyte <= endbyte); + if (key <= out[0]) { + *presult = out[0]; + return (i + 0); + } + if (tnvalue > 0 && key <= out[1]) { + *presult = out[1]; + return (i + 1); + } + if (tnvalue > 1 && key <= out[2]) { + *presult = out[2]; + return (i + 2); + } + if (tnvalue > 2 && key <= out[3]) { + *presult = out[3]; + return (i + 3); + } + } + assert(false); + *presult = key + 1; + return (i); + } + + // Returns a decompressed value in an encoded array + // This code has been optimized for delta-encoded arrays (TODO: optimize for + // the regular case). + uint32_t select(uint32_t *in, size_t index) { + const uint8_t *inbyte = reinterpret_cast(in); + uint32_t out[4]; + out[0] = 0; + out[1] = 0; + out[2] = 0; + out[3] = 0; + size_t i = 0; + uint32_t initial = 0; + uint32_t nvalue = *in; + inbyte += 4; // skip nvalue + if (index + 3 < + nvalue) { // this common case can be done with fewer branches + while (i + 4 <= index) { + inbyte = delta ? scanGroupVarIntDelta(inbyte, &initial) + : scanGroupVarInt(inbyte); // note: delta known at + // compile time: this is not a + // branch + i += 4; + } + inbyte = delta ? 
decodeGroupVarIntDelta(inbyte, &initial, out) + : decodeGroupVarInt(inbyte, out); // note: delta known at + // compile time: this is + // not a branch + return (out[index - i]); + } // else + // we finish with the uncommon case + while (i + 3 < index) { // a single branch will do for this case (bulk of + // the computation) + inbyte = delta ? scanGroupVarIntDelta(inbyte, &initial) + : scanGroupVarInt(inbyte); + i += 4; + } + // lots of branching ahead... + while (i + 3 < nvalue) { + inbyte = delta ? decodeGroupVarIntDelta(inbyte, &initial, out) + : decodeGroupVarInt(inbyte, out); + i += 4; + if (i > index) + return (out[index - (i - 4)]); + } + { + nvalue = static_cast(nvalue - i); + inbyte = decodeCarefully(inbyte, &initial, out, nvalue); + if (index == i) + return (out[0]); + if (nvalue > 1 && index == i + 1) + return (out[1]); + if (nvalue > 2 && index == i + 2) + return (out[2]); + if (nvalue > 3 && index == i + 3) + return (out[3]); + } + assert(false); // we should never get here + return (0); + } + + string name() const { + if (delta) + return "varintgbdelta"; + else + return "varintgb"; + } + + uint8_t *headlessEncode(uint32_t *in, const size_t length, uint32_t prev, + uint8_t *bout) { + size_t k = 0; + for (; k + 3 < length; k += 4) { + uint8_t *keyp = bout++; + *keyp = 0; + { + const uint32_t val = delta ? 
in[k] - prev : in[k]; + if (delta) + prev = in[k]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp = static_cast(1); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp = static_cast(2); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp = static_cast(3); } - else if (value < (1U << 24)) { - *bout++ = static_cast (value); - *bout++ = static_cast (value >> 8); - *bout++ = static_cast (value >> 16); - *keyp |= (2 << shift); + } + { + const uint32_t val = delta ? in[k + 1] - prev : in[k + 1]; + if (delta) + prev = in[k + 1]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 2); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 2); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << 2); } - else { - // the compiler will do the right thing - *reinterpret_cast (bout) = value; - bout += 4; - *keyp |= (3 << shift); + } + { + const uint32_t val = delta ? 
in[k + 2] - prev : in[k + 2]; + if (delta) + prev = in[k + 2]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 4); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 4); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << 4); } - - *(uint32_t *)in = num_ints + 1; - return bout - in; - } - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. - size_t insert(uint32_t *in, const size_t length, uint32_t key) { - size_t bytesize = length * 4; - uint8_t * bytein = (uint8_t *) in; - uint8_t * byteininit = bytein; - size_t bl = insertInByteArray(bytein, bytesize, key); - bytein += bl; - - while (needPaddingTo32Bits(bytein)) { - *bytein++ = 0; + } + { + const uint32_t val = delta ? in[k + 3] - prev : in[k + 3]; + if (delta) + prev = in[k + 3]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 6); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 6); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << 6); } - size_t storageinbytes = bytein - byteininit; - assert((storageinbytes % 4) == 0); - return storageinbytes / 4; + } } - - // insert the key in sorted order. We assume that there is enough room and that delta encoding was used. 
- // the new size is returned (in bytes) - size_t insertInByteArray(uint8_t *inbyte, size_t length, uint32_t key) { - if(length == 0) { - *((uint32_t *) inbyte) = 0; - length = 4; - } - uint8_t * finalinbyte = inbyte + length; - const uint8_t * const initinbyte = inbyte; - uint32_t nvalue = *((uint32_t *) inbyte); - *((uint32_t *) inbyte) = nvalue + 1; // incrementing - inbyte += 4; // skip nvalue - assert(delta); - uint32_t initial = 0; - size_t i = 0; - while (i + 3 < nvalue) { - uint32_t copyinitial = initial; - const uint8_t * const newinbyte = scanGroupVarIntDelta(inbyte, ©initial); - if (copyinitial >= key) { - goto finish; - } - inbyte = (uint8_t *) newinbyte; - initial = copyinitial; - i += 4; - } - finish: - assert(finalinbyte >= inbyte); - assert(i<=nvalue); - static const int REASONABLEBUFFER = 256; - if (nvalue - i + 1 > REASONABLEBUFFER) { - if (nvalue == i) { // straight append - const uint8_t * const newfinalinbyte = headlessEncode(&key, 1, - initial, inbyte); - return newfinalinbyte - initinbyte; - } - if (nvalue - i <= 4) { - // easy case - uint32_t tmpbuffer[5]; - tmpbuffer[0] = key; - size_t decoded = headlessDecode(inbyte, finalinbyte - inbyte, - initial, tmpbuffer + 1, nvalue - i); - assert(decoded == nvalue - i); - sortinfirstvalue(tmpbuffer, nvalue - i); - const uint8_t * const newfinalinbyte = headlessEncode(tmpbuffer, - nvalue - i + 1, initial, inbyte); - return newfinalinbyte - initinbyte; - } - // harder case - // this part is a bit complicated since we need to merge in the key - uint32_t readinitial = initial; - uint32_t tmpbuffer[5]; - tmpbuffer[0] = key; - const uint8_t * readinginbyte = decodeGroupVarIntDelta(inbyte, - &readinitial, tmpbuffer + 1); - assert(tmpbuffer[4] >= key); - assert(readinginbyte > inbyte); - - sortinfirstvalue(tmpbuffer, nvalue - i); - i += 4; - - // initialize blocks - - Block b1, b2; - - Block * block1 = &b1; - Block * block2 = &b2; - Block * blocktmp; - - // load block1 - - uint8_t * fb = 
encodeGroupVarIntDelta(block1->data, initial, - tmpbuffer); - - block1->length = static_cast(fb - block1->data); - uint32_t nextval = tmpbuffer[4] - tmpbuffer[3]; - uint32_t newsel = getByteLength(nextval) - 1; - // everything after that is just going to be shifting - while (nvalue - i >= 4) { - - // load block 2 - assert(readinginbyte >= inbyte); - readinginbyte = loadblock(block2, readinginbyte); - i += 4; - // shift in block 1 - shiftin(block2, &nextval, &newsel); - // write block1 - memcpy(inbyte, block1->data, block1->length); - inbyte += block1->length; - // block1 = block2 - blocktmp = block1; - block1 = block2; - block2 = blocktmp; - } - if (nvalue != i) { - readinginbyte = loadblockcarefully(block2, readinginbyte, - nvalue - i); - finalshiftin(block2, nextval, newsel, nvalue - i + 1);// nextval is useless here - memcpy(inbyte, block1->data, block1->length); - inbyte += block1->length; - memcpy(inbyte, block2->data, block2->length); - inbyte += block2->length; - return inbyte - initinbyte; - } else { - memcpy(inbyte, block1->data, block1->length); - inbyte += block1->length; - inbyte[0] = newsel; - inbyte++; - memcpy(inbyte, &nextval, newsel + 1); - inbyte += newsel + 1; - return inbyte - initinbyte; - } - // we are using brute force here, by decoding everything to a buffer and then reencoding. - } else { - uint32_t tmpbuffer[REASONABLEBUFFER]; - assert(tmpbuffer); - tmpbuffer[0] = key; - if (nvalue != i) { - size_t decoded = headlessDecode(inbyte, finalinbyte - inbyte, - initial, tmpbuffer + 1, nvalue - i); - assert(decoded == nvalue - i); - sortinfirstvalue(tmpbuffer, nvalue - i); - } - const uint8_t * const newfinalinbyte = headlessEncode(tmpbuffer, - nvalue - i + 1, initial, inbyte); - return newfinalinbyte - initinbyte; - } - } - - - // Performs a lower bound find in the encoded array. 
- // Returns the index - // assumes delta coding was used - size_t findLowerBound(const uint32_t *in, const size_t length, uint32_t key, - uint32_t *presult) { - const uint8_t *inbyte = reinterpret_cast(in); - uint32_t out[4] = { 0 }; - assert(delta); - size_t i = 0; - uint32_t initial = 0; - uint32_t nvalue = *in; - - inbyte += 4; // skip nvalue - - const uint8_t * const endbyte = reinterpret_cast(in - + length); - while (i + 3 < nvalue) { - uint32_t gap1, gap2, gap3, gap4; - uint32_t gap12, gap34; - - const uint32_t sel = *inbyte++; - if (sel == 0) { - gap1 = static_cast(inbyte[0]); - gap2 = static_cast(inbyte[1]); - gap12 = gap1 + gap2; - gap3 = static_cast(inbyte[2]); - gap4 = static_cast(inbyte[3]); - gap34 = gap3 + gap4; - inbyte += 4; - } else { - const uint32_t sel1 = (sel & 3); - gap1 = *(reinterpret_cast(inbyte)) - & mask[sel1]; - inbyte += sel1 + 1; - const uint32_t sel2 = ((sel >> 2) & 3); - gap2 = *(reinterpret_cast(inbyte)) - & mask[sel2]; - gap12 = gap1 + gap2; - inbyte += sel2 + 1; - const uint32_t sel3 = ((sel >> 4) & 3); - gap3 = *(reinterpret_cast(inbyte)) - & mask[sel3]; - inbyte += sel3 + 1; - const uint32_t sel4 = (sel >> 6); - gap4 = *(reinterpret_cast(inbyte)) - & mask[sel4]; - gap34 = gap3 + gap4; - inbyte += sel4 + 1; - } - initial += gap12 + gap34; - if (key <= initial) { - if (key <= initial - gap34 - gap2) { - *presult = initial - gap34 - gap2; - return (i + 0); - } - if (key <= initial - gap34) { - *presult = initial - gap34; - return (i + 1); - } - if (key <= initial - gap4) { - *presult = initial - gap4; - return (i + 2); - } - *presult = initial; - return (i + 3); - } - i += 4; - } - if (endbyte > inbyte && nvalue > i) { - uint32_t tnvalue = static_cast(nvalue - 1 - i); - inbyte = decodeCarefully(inbyte, &initial, out, tnvalue); - assert(inbyte <= endbyte); - if (key <= out[0]) { - *presult = out[0]; - return (i + 0); - } - if (tnvalue > 0 && key <= out[1]) { - *presult = out[1]; - return (i + 1); - } - if (tnvalue > 1 && key <= 
out[2]) { - *presult = out[2]; - return (i + 2); - } - if (tnvalue > 2 && key <= out[3]) { - *presult = out[3]; - return (i + 3); - } - - } - assert(false); - *presult = key + 1; - return (i); - } - - - // Returns a decompressed value in an encoded array - // This code has been optimized for delta-encoded arrays (TODO: optimize for the regular case). - uint32_t select(uint32_t *in, size_t index) { - const uint8_t *inbyte = reinterpret_cast (in); - uint32_t out[4]; - out[0] = 0; out[1] = 0; out[2] = 0; out[3] = 0; - size_t i = 0; - uint32_t initial = 0; - uint32_t nvalue = *in; - inbyte += 4; // skip nvalue - if(index + 3 < nvalue) {// this common case can be done with fewer branches - while(i + 4 <= index) { - inbyte = delta ? scanGroupVarIntDelta(inbyte, &initial) : - scanGroupVarInt(inbyte); // note: delta known at compile time: this is not a branch - i += 4; - } - inbyte = delta ? decodeGroupVarIntDelta(inbyte, &initial, out) : - decodeGroupVarInt(inbyte, out); // note: delta known at compile time: this is not a branch - return (out[index - i]); - }// else - // we finish with the uncommon case - while (i + 3 < index) { // a single branch will do for this case (bulk of the computation) - inbyte = delta ? scanGroupVarIntDelta(inbyte, &initial) : - scanGroupVarInt(inbyte); - i += 4; - } - // lots of branching ahead... - while (i + 3 < nvalue) { - inbyte = delta ? decodeGroupVarIntDelta(inbyte, &initial, out) : - decodeGroupVarInt(inbyte, out); - i += 4; - if (i > index) - return (out[index - (i - 4)]); - } - { - nvalue = static_cast(nvalue - i); - inbyte = decodeCarefully(inbyte, &initial, out, nvalue); - if (index == i) - return (out[0]); - if (nvalue > 1 && index == i + 1) - return (out[1]); - if (nvalue > 2 && index == i + 2) - return (out[2]); - if (nvalue > 3 && index == i + 3) - return (out[3]); + if (k < length) { + uint8_t *keyp = bout++; + *keyp = 0; + for (int j = 0; k < length && j < 8; j += 2, ++k) { + const uint32_t val = delta ? 
in[k] - prev : in[k]; + if (delta) + prev = in[k]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << j); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << j); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << j); } - assert(false);// we should never get here - return (0); + } } - - - string name() const { - if(delta) - return "varintgbdelta"; - else - return "varintgb"; + return bout; + } + + // returns how many values were decoded to out, will try to decode + // desirednumber + // if input allows. + size_t headlessDecode(const uint8_t *inbyte, const size_t length, + uint32_t prev, uint32_t *out, + const size_t desirednumber) { + + uint32_t *initout = out; + const uint32_t *const endout = out + desirednumber; + const uint8_t *const endbyte = inbyte + length; + uint32_t val; + while ((endbyte > inbyte + 4 * 4)) { //&& (endout > out + 3) + inbyte = delta ? decodeGroupVarIntDelta(inbyte, &prev, out) + : decodeGroupVarInt(inbyte, out); + out += 4; } - - uint8_t * headlessEncode(uint32_t *in, const size_t length,uint32_t prev, - uint8_t * bout) { - size_t k = 0; - for (; k + 3 < length; k += 4) { - uint8_t * keyp = bout++; - *keyp = 0; - { - const uint32_t val = delta ? 
in[k] - prev : in[k]; - if (delta) - prev = in[k]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp = static_cast(1); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp = static_cast(2); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp = static_cast(3); - } - } - { - const uint32_t val = delta ? in[k + 1] - prev : in[k + 1]; - if (delta) - prev = in[k + 1]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 2); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 2); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << 2); - } - } - { - const uint32_t val = delta ? in[k + 2] - prev : in[k + 2]; - if (delta) - prev = in[k + 2]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 4); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 4); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << 4); - } - } - { - const uint32_t val = delta ? 
in[k + 3] - prev : in[k + 3]; - if (delta) - prev = in[k + 3]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 6); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 6); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << 6); - } - } - } - if (k < length) { - uint8_t * keyp = bout++; - *keyp = 0; - for (int j = 0; k < length && j < 8; j += 2, ++k) { - const uint32_t val = delta ? in[k] - prev : in[k]; - if (delta) - prev = in[k]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << j); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << j); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << j); - } - } - } - return bout; - } - - // returns how many values were decoded to out, will try to decode desirednumber - // if input allows. - size_t headlessDecode(const uint8_t * inbyte, const size_t length, uint32_t prev, - uint32_t *out, const size_t desirednumber) { - - uint32_t * initout = out; - const uint32_t * const endout = out + desirednumber; - const uint8_t * const endbyte = inbyte + length; - uint32_t val; - while ((endbyte > inbyte + 4 * 4 ) ) {//&& (endout > out + 3) - inbyte = delta ? 
decodeGroupVarIntDelta(inbyte, &prev, out) : - decodeGroupVarInt(inbyte, out); - out+=4; - } - while (endbyte > inbyte + 1) { - uint8_t key = *inbyte++; - //printf("last key is %u \n",key); - for (int k = 0; (k < 4) && (endout > out); k++) { - const uint32_t howmanybyte = key & 3; - //printf("last key is %u howmanybyte = %u \n",key, howmanybyte+1); - - key=static_cast(key>>2); - val = static_cast (*inbyte++); - if (howmanybyte >= 1) { - val |= (static_cast (*inbyte++) << 8) ; - if (howmanybyte >= 2) { - val |= (static_cast (*inbyte++) << 16) ; - if (howmanybyte >= 3) { - val |= (static_cast (*inbyte++) << 24); - } - } - } - //printf("decoded %u\n",val); - prev = (delta ? prev : 0) + val; - //printf("writing %u\n",prev); - - *out++ = prev; + while (endbyte > inbyte + 1) { + uint8_t key = *inbyte++; + // printf("last key is %u \n",key); + for (int k = 0; (k < 4) && (endout > out); k++) { + const uint32_t howmanybyte = key & 3; + // printf("last key is %u howmanybyte = %u \n",key, howmanybyte+1); + + key = static_cast(key >> 2); + val = static_cast(*inbyte++); + if (howmanybyte >= 1) { + val |= (static_cast(*inbyte++) << 8); + if (howmanybyte >= 2) { + val |= (static_cast(*inbyte++) << 16); + if (howmanybyte >= 3) { + val |= (static_cast(*inbyte++) << 24); } - assert(inbyte <= endbyte); + } } - return out - initout; - } + // printf("decoded %u\n",val); + prev = (delta ? prev : 0) + val; + // printf("writing %u\n",prev); + *out++ = prev; + } + assert(inbyte <= endbyte); + } + return out - initout; + } private: + const uint32_t mask[4] = {0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF}; + void sortinfirstvalue(uint32_t *tmpbuffer, size_t length) { + size_t top = length < 4 ? length : 4; - const uint32_t mask[4] = { 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF }; - - - void sortinfirstvalue(uint32_t * tmpbuffer, size_t length) { - size_t top = length < 4 ? 
length : 4; - - for (size_t j = 0; j < top; ++j) { - if (tmpbuffer[j] > tmpbuffer[j + 1]) { - uint32_t t = tmpbuffer[j + 1]; - tmpbuffer[j + 1] = tmpbuffer[j]; - tmpbuffer[j] = t; - } - } + for (size_t j = 0; j < top; ++j) { + if (tmpbuffer[j] > tmpbuffer[j + 1]) { + uint32_t t = tmpbuffer[j + 1]; + tmpbuffer[j + 1] = tmpbuffer[j]; + tmpbuffer[j] = t; + } } - - const uint8_t* decodeGroupVarInt(const uint8_t* in, uint32_t* out) { - const uint32_t sel = *in++; - if(sel == 0) { - out[0] = static_cast (in[0]); - out[1] = static_cast (in[1]); - out[2] = static_cast (in[2]); - out[3] = static_cast (in[3]); - return in + 4; - } - const uint32_t sel1 = (sel & 3); - *out++ = *((uint32_t*)(in)) & mask[sel1]; - in += sel1 + 1; - const uint32_t sel2 = ((sel >> 2) & 3); - *out++ = *((uint32_t*)(in)) & mask[sel2]; - in += sel2 + 1; - const uint32_t sel3 = ((sel >> 4) & 3); - *out++ = *((uint32_t*)(in)) & mask[sel3]; - in += sel3 + 1; - const uint32_t sel4 = (sel >> 6); - *out++ = *((uint32_t*)(in)) & mask[sel4]; - in += sel4 + 1; - return in; + } + + const uint8_t *decodeGroupVarInt(const uint8_t *in, uint32_t *out) { + const uint32_t sel = *in++; + if (sel == 0) { + out[0] = static_cast(in[0]); + out[1] = static_cast(in[1]); + out[2] = static_cast(in[2]); + out[3] = static_cast(in[3]); + return in + 4; } - - const uint8_t* decodeGroupVarIntDelta(const uint8_t* in, uint32_t * val, uint32_t* out) { - const uint32_t sel = *in++; - if(sel == 0) { - out[0] = (* val += static_cast (in[0])); - out[1] = (* val += static_cast (in[1])); - out[2] = (* val += static_cast (in[2])); - out[3] = (* val += static_cast (in[3])); - return in + 4; - } - const uint32_t sel1 = (sel & 3); - *val += *((uint32_t*)(in)) & mask[sel1]; - *out++ = *val; - in += sel1 + 1; - const uint32_t sel2 = ((sel >> 2) & 3); - *val += *((uint32_t*)(in)) & mask[sel2]; - *out++ = *val; - in += sel2 + 1; - const uint32_t sel3 = ((sel >> 4) & 3); - *val += *((uint32_t*)(in)) & mask[sel3]; - *out++ = *val; - in += sel3 + 1; 
- const uint32_t sel4 = (sel >> 6); - *val += *((uint32_t*)(in)) & mask[sel4]; - *out++ = *val; - in += sel4 + 1; - return in; + const uint32_t sel1 = (sel & 3); + *out++ = *((uint32_t *)(in)) & mask[sel1]; + in += sel1 + 1; + const uint32_t sel2 = ((sel >> 2) & 3); + *out++ = *((uint32_t *)(in)) & mask[sel2]; + in += sel2 + 1; + const uint32_t sel3 = ((sel >> 4) & 3); + *out++ = *((uint32_t *)(in)) & mask[sel3]; + in += sel3 + 1; + const uint32_t sel4 = (sel >> 6); + *out++ = *((uint32_t *)(in)) & mask[sel4]; + in += sel4 + 1; + return in; + } + + const uint8_t *decodeGroupVarIntDelta(const uint8_t *in, uint32_t *val, + uint32_t *out) { + const uint32_t sel = *in++; + if (sel == 0) { + out[0] = (*val += static_cast(in[0])); + out[1] = (*val += static_cast(in[1])); + out[2] = (*val += static_cast(in[2])); + out[3] = (*val += static_cast(in[3])); + return in + 4; } - - const uint8_t *decodeCarefully(const uint8_t *inbyte, - uint32_t *initial, uint32_t *out, uint32_t count) { - uint32_t val; - uint32_t k, key = *inbyte++; - for (k = 0; k < count && k < 4; k++) { - const uint32_t howmanybyte = key & 3; - key = static_cast(key>>2); - val = static_cast (*inbyte++); - if (howmanybyte >= 1) { - val |= (static_cast (*inbyte++) << 8) ; - if (howmanybyte >= 2) { - val |= (static_cast (*inbyte++) << 16) ; - if (howmanybyte >= 3) { - val |= (static_cast (*inbyte++) << 24); - } - } - } - if(delta) { - *initial += val; - *out = *initial; - } else { - *out = val; - } - out++; + const uint32_t sel1 = (sel & 3); + *val += *((uint32_t *)(in)) & mask[sel1]; + *out++ = *val; + in += sel1 + 1; + const uint32_t sel2 = ((sel >> 2) & 3); + *val += *((uint32_t *)(in)) & mask[sel2]; + *out++ = *val; + in += sel2 + 1; + const uint32_t sel3 = ((sel >> 4) & 3); + *val += *((uint32_t *)(in)) & mask[sel3]; + *out++ = *val; + in += sel3 + 1; + const uint32_t sel4 = (sel >> 6); + *val += *((uint32_t *)(in)) & mask[sel4]; + *out++ = *val; + in += sel4 + 1; + return in; + } + + const uint8_t 
*decodeCarefully(const uint8_t *inbyte, uint32_t *initial, + uint32_t *out, uint32_t count) { + uint32_t val; + uint32_t k, key = *inbyte++; + for (k = 0; k < count && k < 4; k++) { + const uint32_t howmanybyte = key & 3; + key = static_cast(key >> 2); + val = static_cast(*inbyte++); + if (howmanybyte >= 1) { + val |= (static_cast(*inbyte++) << 8); + if (howmanybyte >= 2) { + val |= (static_cast(*inbyte++) << 16); + if (howmanybyte >= 3) { + val |= (static_cast(*inbyte++) << 24); + } } - return (inbyte); + } + if (delta) { + *initial += val; + *out = *initial; + } else { + *out = val; + } + out++; } - - - const uint8_t* scanGroupVarIntDelta(const uint8_t* in, uint32_t * val) { - const uint32_t sel = *in++; - if (sel == 0) { - *val += static_cast(in[0]); - *val += static_cast(in[1]); - *val += static_cast(in[2]); - *val += static_cast(in[3]); - return in + 4; - } - const uint32_t sel1 = (sel & 3); - *val += *(reinterpret_cast(in)) & mask[sel1]; - in += sel1 + 1; - const uint32_t sel2 = ((sel >> 2) & 3); - *val += *(reinterpret_cast(in)) & mask[sel2]; - in += sel2 + 1; - const uint32_t sel3 = ((sel >> 4) & 3); - *val += *(reinterpret_cast(in)) & mask[sel3]; - in += sel3 + 1; - const uint32_t sel4 = (sel >> 6); - *val += *(reinterpret_cast(in)) & mask[sel4]; - in += sel4 + 1; - return in; - } - - - const uint8_t* scanGroupVarInt(const uint8_t* in) { - const uint32_t sel = *in++; - if (sel == 0) { - return in + 4; - } - const uint32_t sel1 = (sel & 3); - in += sel1 + 1; - const uint32_t sel2 = ((sel >> 2) & 3); - in += sel2 + 1; - const uint32_t sel3 = ((sel >> 4) & 3); - in += sel3 + 1; - const uint32_t sel4 = (sel >> 6); - in += sel4 + 1; - return in; - } - - // encode 4 integers - uint8_t * encodeGroupVarIntDelta(uint8_t *bout, uint32_t prev, - uint32_t * in) { - uint8_t * keyp = bout++; - *keyp = 0; - { - const uint32_t val = in[0] - prev; - prev = in[0]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = 
static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp = static_cast(1); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp = static_cast(2); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp = static_cast(3); - } - } - { - const uint32_t val = in[1] - prev; - prev = in[1]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 2); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 2); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << 2); - } - } - { - const uint32_t val = in[2] - prev; - prev = in[2]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 4); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 4); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 << 4); - } - } - { - const uint32_t val = in[3] - prev; - prev = in[3]; - if (val < (1U << 8)) { - *bout++ = static_cast(val); - } else if (val < (1U << 16)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *keyp |= static_cast(1 << 6); - } else if (val < (1U << 24)) { - *bout++ = static_cast(val); - *bout++ = static_cast(val >> 8); - *bout++ = static_cast(val >> 16); - *keyp |= static_cast(2 << 6); - } else { - // the compiler will do the right thing - *reinterpret_cast(bout) = val; - bout += 4; - *keyp |= static_cast(3 
<< 6); - } - } - return bout; - } - - uint32_t getByteLength(uint32_t val) { - if (val < (1U << 8)) { - return 1; - } else if (val < (1U << 16)) { - return 2; - } else if (val < (1U << 24)) { - return 3; - } else { - return 4; - } - } - - struct Block {// should fit in two cache lines. - uint8_t data[4*4+1 + 3];// the final +3 is a safety buffer - uint32_t length; - }; - - uint8_t lengths[256] = {5,6,7,8,6,7,8,9,7,8,9,10,8,9,10,11,6,7,8,9,7,8,9,10,8,9,10,11,9,10,11,12,7,8,9,10,8,9,10,11,9,10,11,12,10,11,12,13,8,9,10,11,9,10,11,12,10,11,12,13,11,12,13,14,6,7,8,9,7,8,9,10,8,9,10,11,9,10,11,12,7,8,9,10,8,9,10,11,9,10,11,12,10,11,12,13,8,9,10,11,9,10,11,12,10,11,12,13,11,12,13,14,9,10,11,12,10,11,12,13,11,12,13,14,12,13,14,15,7,8,9,10,8,9,10,11,9,10,11,12,10,11,12,13,8,9,10,11,9,10,11,12,10,11,12,13,11,12,13,14,9,10,11,12,10,11,12,13,11,12,13,14,12,13,14,15,10,11,12,13,11,12,13,14,12,13,14,15,13,14,15,16,8,9,10,11,9,10,11,12,10,11,12,13,11,12,13,14,9,10,11,12,10,11,12,13,11,12,13,14,12,13,14,15,10,11,12,13,11,12,13,14,12,13,14,15,13,14,15,16,11,12,13,14,12,13,14,15,13,14,15,16,14,15,16,17}; - - const uint8_t * loadblock(Block * b, const uint8_t * readinginbyte) { - b->length = lengths[readinginbyte[0]]; - memcpy(b->data,readinginbyte,b->length); - return readinginbyte+b->length; - } - - const uint8_t * loadblockcarefully(Block * b, const uint8_t * readinginbyte, size_t howmanyvals) { - b->length = 1; - for(size_t k = 0; k < howmanyvals; ++k) - b->length += 1 + ((readinginbyte[0]>>(2*k)) & 3); - memcpy(b->data,readinginbyte,b->length); - return readinginbyte+b->length; - } - - void shiftin(Block * b, uint32_t * nextval, uint32_t * newsel) { - uint32_t offsettolastval = lengths[b->data[0] & 63 ] - 1; - uint32_t newnextsel = b->data[0] >> 6; - uint32_t newnextval; - memcpy(&newnextval,b->data + offsettolastval,4); - newnextval &= mask[newnextsel]; - //uint32_t newnextval = *(reinterpret_cast(b->data + offsettolastval)) & mask[newnextsel]; - b->data[0] = (b->data[0]<<2) | 
*newsel; - std::memmove(b->data + 2 + *newsel,b->data + 1,b->length - 1 - 1 - newnextsel); - b->length = offsettolastval + 1 + *newsel; - std::memcpy(b->data + 1, nextval,*newsel + 1); - *nextval = newnextval; - *newsel = newnextsel; - } - - void finalshiftin(Block * b, uint32_t nextval, uint32_t newsel, size_t howmany) { - b->data[0] = (b->data[0]<<2) | newsel; - std::memmove(b->data + 2 + newsel,b->data + 1,b->length - 1 ); - b->length = 1; - for(size_t k = 0; k < howmany; ++k) - b->length += 1 + ((b->data[0]>>(2*k)) & 3); - std::memcpy(b->data + 1, &nextval,newsel + 1); - } - - - + return (inbyte); + } + + const uint8_t *scanGroupVarIntDelta(const uint8_t *in, uint32_t *val) { + const uint32_t sel = *in++; + if (sel == 0) { + *val += static_cast(in[0]); + *val += static_cast(in[1]); + *val += static_cast(in[2]); + *val += static_cast(in[3]); + return in + 4; + } + const uint32_t sel1 = (sel & 3); + *val += *(reinterpret_cast(in)) & mask[sel1]; + in += sel1 + 1; + const uint32_t sel2 = ((sel >> 2) & 3); + *val += *(reinterpret_cast(in)) & mask[sel2]; + in += sel2 + 1; + const uint32_t sel3 = ((sel >> 4) & 3); + *val += *(reinterpret_cast(in)) & mask[sel3]; + in += sel3 + 1; + const uint32_t sel4 = (sel >> 6); + *val += *(reinterpret_cast(in)) & mask[sel4]; + in += sel4 + 1; + return in; + } + + const uint8_t *scanGroupVarInt(const uint8_t *in) { + const uint32_t sel = *in++; + if (sel == 0) { + return in + 4; + } + const uint32_t sel1 = (sel & 3); + in += sel1 + 1; + const uint32_t sel2 = ((sel >> 2) & 3); + in += sel2 + 1; + const uint32_t sel3 = ((sel >> 4) & 3); + in += sel3 + 1; + const uint32_t sel4 = (sel >> 6); + in += sel4 + 1; + return in; + } + + // encode 4 integers + uint8_t *encodeGroupVarIntDelta(uint8_t *bout, uint32_t prev, uint32_t *in) { + uint8_t *keyp = bout++; + *keyp = 0; + { + const uint32_t val = in[0] - prev; + prev = in[0]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = 
static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp = static_cast(1); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp = static_cast(2); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp = static_cast(3); + } + } + { + const uint32_t val = in[1] - prev; + prev = in[1]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 2); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 2); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << 2); + } + } + { + const uint32_t val = in[2] - prev; + prev = in[2]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 4); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 4); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 << 4); + } + } + { + const uint32_t val = in[3] - prev; + prev = in[3]; + if (val < (1U << 8)) { + *bout++ = static_cast(val); + } else if (val < (1U << 16)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *keyp |= static_cast(1 << 6); + } else if (val < (1U << 24)) { + *bout++ = static_cast(val); + *bout++ = static_cast(val >> 8); + *bout++ = static_cast(val >> 16); + *keyp |= static_cast(2 << 6); + } else { + // the compiler will do the right thing + *reinterpret_cast(bout) = val; + bout += 4; + *keyp |= static_cast(3 
<< 6); + } + } + return bout; + } + + uint32_t getByteLength(uint32_t val) { + if (val < (1U << 8)) { + return 1; + } else if (val < (1U << 16)) { + return 2; + } else if (val < (1U << 24)) { + return 3; + } else { + return 4; + } + } + + struct Block { // should fit in two cache lines. + uint8_t data[4 * 4 + 1 + 3]; // the final +3 is a safety buffer + uint32_t length; + }; + + uint8_t lengths[256] = { + 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, + 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, + 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, + 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, + 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, + 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, + 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16, 8, 9, 10, 11, 9, 10, + 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, + 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, + 14, 15, 13, 14, 15, 16, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16, + 14, 15, 16, 17}; + + const uint8_t *loadblock(Block *b, const uint8_t *readinginbyte) { + b->length = lengths[readinginbyte[0]]; + memcpy(b->data, readinginbyte, b->length); + return readinginbyte + b->length; + } + + const uint8_t *loadblockcarefully(Block *b, const uint8_t *readinginbyte, + size_t howmanyvals) { + b->length = 1; + for (size_t k = 0; k < howmanyvals; ++k) + b->length += 1 + ((readinginbyte[0] >> (2 * k)) & 3); + memcpy(b->data, readinginbyte, b->length); + return readinginbyte + b->length; + } + + void shiftin(Block *b, uint32_t *nextval, uint32_t *newsel) { + uint32_t offsettolastval = 
lengths[b->data[0] & 63] - 1; + uint32_t newnextsel = b->data[0] >> 6; + uint32_t newnextval; + memcpy(&newnextval, b->data + offsettolastval, 4); + newnextval &= mask[newnextsel]; + // uint32_t newnextval = *(reinterpret_cast(b->data + + // offsettolastval)) & mask[newnextsel]; + b->data[0] = (b->data[0] << 2) | *newsel; + std::memmove(b->data + 2 + *newsel, b->data + 1, + b->length - 1 - 1 - newnextsel); + b->length = offsettolastval + 1 + *newsel; + std::memcpy(b->data + 1, nextval, *newsel + 1); + *nextval = newnextval; + *newsel = newnextsel; + } + + void finalshiftin(Block *b, uint32_t nextval, uint32_t newsel, + size_t howmany) { + b->data[0] = (b->data[0] << 2) | newsel; + std::memmove(b->data + 2 + newsel, b->data + 1, b->length - 1); + b->length = 1; + for (size_t k = 0; k < howmany; ++k) + b->length += 1 + ((b->data[0] >> (2 * k)) & 3); + std::memcpy(b->data + 1, &nextval, newsel + 1); + } }; - - - - - - - - } // namespace SIMDCompressionLib #endif /* SIMDCompressionAndIntersection_VARINTGB_H_ */ diff --git a/src/benchintersection.cpp b/src/benchintersection.cpp index 4c2c51e..f4508ab 100644 --- a/src/benchintersection.cpp +++ b/src/benchintersection.cpp @@ -10,7 +10,7 @@ #include "intersection.h" // https://code.google.com/p/likwid/wiki/LikwidPerfCtr#Using_the_marker_API -#ifdef LIKWID_MARKERS // see 'make likwidintersection' for compiler flags +#ifdef LIKWID_MARKERS // see 'make likwidintersection' for compiler flags #include #endif @@ -26,53 +26,47 @@ using namespace SIMDCompressionLib; * * To match our clueweb, we use a range of values in [0,2**26). 
*/ -template -pair, vector> getNaivePair(generator gen, - uint32_t minlength, uint32_t Max, float sizeratio, - float intersectionratio) { - if (sizeratio < 1) - throw runtime_error("sizeratio should be larger or equal to 1"); - if (intersectionratio < 0) - throw runtime_error("intersectionratio should be positive"); - if (intersectionratio > 1) - throw runtime_error("intersectionratio cannot be larger than 1"); - const uint32_t maxlenth = static_cast(round( - static_cast(minlength) * sizeratio)); - if (maxlenth > Max) - throw runtime_error( - "I can't generate an array so large in such a small range."); - if (maxlenth < minlength) - throw runtime_error("something went wrong, possibly an overflow."); - // we basically assume that, if we do nothing, intersections are very small - const uint32_t intersize = static_cast(round( - static_cast(minlength) * intersectionratio)); - - vector < uint32_t > inter = gen.generate(intersize, Max); - vector < uint32_t > smallest = unite( - gen.generate(static_cast(minlength - inter.size()), Max), - inter); - vector < uint32_t > largest = unite( - gen.generate(static_cast(maxlenth - inter.size()), Max), - inter); - vector < uint32_t > intersection = intersect(smallest, largest); - if (largest.size() > smallest.size()) - return pair, vector>(smallest, largest); - return pair, vector>(largest, smallest); - +template +pair, vector> +getNaivePair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio, + float intersectionratio) { + if (sizeratio < 1) + throw runtime_error("sizeratio should be larger or equal to 1"); + if (intersectionratio < 0) + throw runtime_error("intersectionratio should be positive"); + if (intersectionratio > 1) + throw runtime_error("intersectionratio cannot be larger than 1"); + const uint32_t maxlenth = + static_cast(round(static_cast(minlength) * sizeratio)); + if (maxlenth > Max) + throw runtime_error( + "I can't generate an array so large in such a small range."); + if (maxlenth < minlength) + throw 
runtime_error("something went wrong, possibly an overflow."); + // we basically assume that, if we do nothing, intersections are very small + const uint32_t intersize = static_cast( + round(static_cast(minlength) * intersectionratio)); + + vector inter = gen.generate(intersize, Max); + vector smallest = + unite(gen.generate(static_cast(minlength - inter.size()), Max), + inter); + vector largest = unite( + gen.generate(static_cast(maxlenth - inter.size()), Max), inter); + vector intersection = intersect(smallest, largest); + if (largest.size() > smallest.size()) + return pair, vector>(smallest, largest); + return pair, vector>(largest, smallest); } /** * Silly function. */ -uint16_t _high16(uint32_t x) { - return static_cast(x >> 16); -} +uint16_t _high16(uint32_t x) { return static_cast(x >> 16); } /** * Another function. */ -uint16_t _low16(uint32_t x) { - return static_cast(x); -} +uint16_t _low16(uint32_t x) { return static_cast(x); } /** * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions @@ -81,335 +75,311 @@ uint16_t _low16(uint32_t x) { // s_a - size of A // R - partitioned sorted array size_t partition(const uint32_t *A, const size_t s_a, uint16_t *R, - const size_t /*Rlength*/) { - uint16_t high = 0; - int partition_length = 0; - size_t partition_size_position = 1; - size_t counter = 0; - size_t p = 0; - if (p < s_a) { - uint16_t chigh = _high16(A[p]); // upper dword - uint16_t clow = _low16(A[p]); // lower dword - if (chigh == 0) { - R[counter++] = chigh; // partition prefix - R[counter++] = 0; // reserve place for partition size - R[counter++] = clow; // write the first element - partition_length = 1; // reset counters - high = chigh; - ++p; - } - - } - for (; p < s_a; p++) { - uint16_t chigh = _high16(A[p]); // upper dword - uint16_t clow = _low16(A[p]); // lower dword - if (chigh == high && p != 0) { // add element to the current partition - R[counter++] = clow; - partition_length++; - } else { // start new partition - 
R[counter++] = chigh; // partition prefix - R[counter++] = 0; // reserve place for partition size - R[counter++] = clow; // write the first element - R[partition_size_position] = static_cast(partition_length - - 1); // store "-1" - partition_length = 1; // reset counters - partition_size_position = counter - 2; - high = chigh; - } - } - R[partition_size_position] = static_cast(partition_length - 1); - - return counter; + const size_t /*Rlength*/) { + uint16_t high = 0; + int partition_length = 0; + size_t partition_size_position = 1; + size_t counter = 0; + size_t p = 0; + if (p < s_a) { + uint16_t chigh = _high16(A[p]); // upper dword + uint16_t clow = _low16(A[p]); // lower dword + if (chigh == 0) { + R[counter++] = chigh; // partition prefix + R[counter++] = 0; // reserve place for partition size + R[counter++] = clow; // write the first element + partition_length = 1; // reset counters + high = chigh; + ++p; + } + } + for (; p < s_a; p++) { + uint16_t chigh = _high16(A[p]); // upper dword + uint16_t clow = _low16(A[p]); // lower dword + if (chigh == high && p != 0) { // add element to the current partition + R[counter++] = clow; + partition_length++; + } else { // start new partition + R[counter++] = chigh; // partition prefix + R[counter++] = 0; // reserve place for partition size + R[counter++] = clow; // write the first element + R[partition_size_position] = + static_cast(partition_length - 1); // store "-1" + partition_length = 1; // reset counters + partition_size_position = counter - 2; + high = chigh; + } + } + R[partition_size_position] = static_cast(partition_length - 1); + + return counter; } -const static __m128i shuffle_mask16[256] = { _mm_setr_epi8(-1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, - 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), 
_mm_setr_epi8(4, - 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, -1, - -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(8, 9, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 
3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 10, 11, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 10, 11, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 10, - 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 10, 11, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, - 11, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 10, 11, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 10, 11, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 
9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1), - _mm_setr_epi8(12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 12, 13, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 12, - 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 12, 13, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, - 13, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 12, 13, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 12, 13, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1), - _mm_setr_epi8(10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 10, 11, 12, 13, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 10, 11, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 10, - 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 12, 13, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 10, 11, 12, 13, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 10, 11, 12, 13, -1, -1, 
-1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, - 11, 12, 13, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 10, 11, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 10, 11, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, - 12, 13, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1), - _mm_setr_epi8(14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 14, 15, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 14, - 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, - -1, -1, -1, 
-1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 14, - 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 14, 15, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 10, 
11, 14, 15, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 10, 11, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 10, - 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 14, 15, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 10, 11, 14, 15, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, - 11, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 10, 11, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 10, 11, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, - 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 
7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1), - _mm_setr_epi8(12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 12, 13, 14, 15, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 12, 13, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 12, - 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 12, 13, 14, 15, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, - 13, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(8, 9, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 12, 13, 14, 15, -1, -1, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 
- 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1), - _mm_setr_epi8(10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1), _mm_setr_epi8(0, 1, 10, 11, 12, 13, 14, 15, -1, -1, -1, - -1, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 10, - 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(4, 5, - 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, - -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 12, 13, - 14, 15, -1, -1, -1, -1), _mm_setr_epi8(6, 7, 10, 11, 12, 13, 14, - 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 6, 7, - 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), _mm_setr_epi8( - 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, - -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, - 15, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, - 11, 12, 13, 
14, 15, -1, -1), _mm_setr_epi8(8, 9, 10, 11, 12, 13, - 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 8, - 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, - -1), _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, - -1, -1, -1), _mm_setr_epi8(4, 5, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, - 12, 13, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(2, 3, 4, 5, 8, 9, - 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), _mm_setr_epi8(0, 1, 2, - 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), _mm_setr_epi8(6, - 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), - _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), - _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; +const static __m128i shuffle_mask16[256] = { + _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 10, 11, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), + 
_mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1), + _mm_setr_epi8(12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), + 
_mm_setr_epi8(6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 
3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1), + _mm_setr_epi8(14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 14, 15, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, 
-1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1), + _mm_setr_epi8(12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 
1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 
7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1), + _mm_setr_epi8(0, 1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 10, 11, 12, 
13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1), + _mm_setr_epi8(0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1), + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}; /** * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions @@ -417,57 +387,57 @@ const static __m128i shuffle_mask16[256] = { _mm_setr_epi8(-1, -1, -1, -1, -1, * Close to the original */ static size_t _original_intersect_vector16(const uint16_t *A, const uint16_t *B, - const size_t s_a, const size_t s_b, uint16_t *C) { - size_t count = 0; - size_t i_a = 0, i_b = 0; - - const size_t st_a = (s_a / 8) * 8; - const size_t st_b = (s_b / 8) * 8; - __m128i v_a, v_b; - if ((i_a < st_a) and (i_b < st_b)) { - v_a = _mm_loadu_si128((const __m128i *) &A[i_a]); - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - if ((i_a < st_a) and (i_b < st_b)) - while (true) { - const __m128i res_v = _mm_cmpestrm(v_b, 16, v_a, 16, - _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); - const int r = _mm_extract_epi32(res_v, 0); - __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); - _mm_storeu_si128((__m128i *) &C[count], p); - count += _mm_popcnt_u32(r); - const uint16_t a_max = A[i_a + 7]; - const uint16_t b_max = B[i_b + 7]; - if (a_max <= b_max) { - i_a += 8; - if (i_a == st_a) - break; - v_a = _mm_loadu_si128((const __m128i *) &A[i_a]); - - } - if (b_max <= a_max) { - i_b += 8; - if (i_b == st_b) - 
break; - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - - } - } - } - // intersect the tail using scalar intersection - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - C[count] = A[i_a]; - count++; - i_a++; - i_b++; - } - } - - return count; + const size_t s_a, const size_t s_b, + uint16_t *C) { + size_t count = 0; + size_t i_a = 0, i_b = 0; + + const size_t st_a = (s_a / 8) * 8; + const size_t st_b = (s_b / 8) * 8; + __m128i v_a, v_b; + if ((i_a < st_a) and (i_b < st_b)) { + v_a = _mm_loadu_si128((const __m128i *)&A[i_a]); + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + if ((i_a < st_a) and (i_b < st_b)) + while (true) { + const __m128i res_v = _mm_cmpestrm( + v_b, 16, v_a, 16, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); + _mm_storeu_si128((__m128i *)&C[count], p); + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + 7]; + const uint16_t b_max = B[i_b + 7]; + if (a_max <= b_max) { + i_a += 8; + if (i_a == st_a) + break; + v_a = _mm_loadu_si128((const __m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += 8; + if (i_b == st_b) + break; + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + } + } + } + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + C[count] = A[i_a]; + count++; + i_a++; + i_b++; + } + } + + return count; } /** @@ -476,484 +446,490 @@ static size_t _original_intersect_vector16(const uint16_t *A, const uint16_t *B, * Optimized by D. 
Lemire on May 3rd 2013 */ static size_t _intersect_vector16(const uint16_t *A, const uint16_t *B, - const size_t s_a, const size_t s_b, uint16_t *C) { - size_t count = 0; - size_t i_a = 0, i_b = 0; - - const size_t st_a = (s_a / 8) * 8; - const size_t st_b = (s_b / 8) * 8; - __m128i v_a, v_b; - if ((i_a < st_a) and (i_b < st_b)) { - v_a = _mm_loadu_si128((const __m128i *) &A[i_a]); - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - while ((A[i_a] == 0) or (B[i_b] == 0)) { - const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8, - _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); - const int r = _mm_extract_epi32(res_v, 0); - __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); - _mm_storeu_si128((__m128i *) &C[count], p); - count += _mm_popcnt_u32(r); - const uint16_t a_max = A[i_a + 7]; - const uint16_t b_max = B[i_b + 7]; - if (a_max <= b_max) { - i_a += 8; - if (i_a == st_a) - break; - v_a = _mm_loadu_si128((const __m128i *) &A[i_a]); - - } - if (b_max <= a_max) { - i_b += 8; - if (i_b == st_b) - break; - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - - } - - } - if ((i_a < st_a) and (i_b < st_b)) - while (true) { - const __m128i res_v = _mm_cmpistrm(v_b, v_a, - _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); - const int r = _mm_extract_epi32(res_v, 0); - __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); - _mm_storeu_si128((__m128i *) &C[count], p); - count += _mm_popcnt_u32(r); - const uint16_t a_max = A[i_a + 7]; - const uint16_t b_max = B[i_b + 7]; - if (a_max <= b_max) { - i_a += 8; - if (i_a == st_a) - break; - v_a = _mm_loadu_si128((const __m128i *) &A[i_a]); - - } - if (b_max <= a_max) { - i_b += 8; - if (i_b == st_b) - break; - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - - } - } - } - // intersect the tail using scalar intersection - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - C[count] = A[i_a]; - count++; - i_a++; - i_b++; - } - } - - return 
count; + const size_t s_a, const size_t s_b, + uint16_t *C) { + size_t count = 0; + size_t i_a = 0, i_b = 0; + + const size_t st_a = (s_a / 8) * 8; + const size_t st_b = (s_b / 8) * 8; + __m128i v_a, v_b; + if ((i_a < st_a) and (i_b < st_b)) { + v_a = _mm_loadu_si128((const __m128i *)&A[i_a]); + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + while ((A[i_a] == 0) or (B[i_b] == 0)) { + const __m128i res_v = + _mm_cmpestrm(v_b, 8, v_a, 8, + _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); + _mm_storeu_si128((__m128i *)&C[count], p); + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + 7]; + const uint16_t b_max = B[i_b + 7]; + if (a_max <= b_max) { + i_a += 8; + if (i_a == st_a) + break; + v_a = _mm_loadu_si128((const __m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += 8; + if (i_b == st_b) + break; + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + } + } + if ((i_a < st_a) and (i_b < st_b)) + while (true) { + const __m128i res_v = _mm_cmpistrm( + v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); + const int r = _mm_extract_epi32(res_v, 0); + __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); + _mm_storeu_si128((__m128i *)&C[count], p); + count += _mm_popcnt_u32(r); + const uint16_t a_max = A[i_a + 7]; + const uint16_t b_max = B[i_b + 7]; + if (a_max <= b_max) { + i_a += 8; + if (i_a == st_a) + break; + v_a = _mm_loadu_si128((const __m128i *)&A[i_a]); + } + if (b_max <= a_max) { + i_b += 8; + if (i_b == st_b) + break; + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + } + } + } + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + C[count] = A[i_a]; + count++; + i_a++; + i_b++; + } + } + + return count; } static size_t _intersectV1_vector16(const uint16_t *A, const uint16_t *B, - const size_t s_a, 
const size_t s_b, uint16_t *C) { - if (s_a > s_b) - return _intersectV1_vector16(B, A, s_b, s_a, C); - size_t count = 0; - size_t i_a = 0, i_b = 0; - const size_t st_a = s_a; - const size_t st_b = (s_b / 8) * 8; - __m128i v_b; - if ((i_a < st_a) and (i_b < st_b)) { - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - while (true) { - const __m128i v_a = _mm_set1_epi16(A[i_a]); - const __m128i F0 = _mm_cmpeq_epi16(v_a, v_b); + const size_t s_a, const size_t s_b, + uint16_t *C) { + if (s_a > s_b) + return _intersectV1_vector16(B, A, s_b, s_a, C); + size_t count = 0; + size_t i_a = 0, i_b = 0; + const size_t st_a = s_a; + const size_t st_b = (s_b / 8) * 8; + __m128i v_b; + if ((i_a < st_a) and (i_b < st_b)) { + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + while (true) { + const __m128i v_a = _mm_set1_epi16(A[i_a]); + const __m128i F0 = _mm_cmpeq_epi16(v_a, v_b); #ifdef __SSE4_1__ - if (_mm_testz_si128(F0, F0)) { + if (_mm_testz_si128(F0, F0)) { #else - if (!_mm_movemask_epi8(F0)) { + if (!_mm_movemask_epi8(F0)) { #endif - } else { - C[count] = A[i_a]; - count++; - } - ++i_a; - if (i_a == st_a) - goto FINISH_SCALAR; - if (B[i_b + 7] < A[i_a]) { - i_b += 8; - if (i_b == st_b) - goto FINISH_SCALAR; - v_b = _mm_loadu_si128((const __m128i *) &B[i_b]); - } - } - } - FINISH_SCALAR: - // intersect the tail using scalar intersection - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - C[count] = A[i_a]; - count++; - i_a++; - i_b++; - } - } - - return count; + } else { + C[count] = A[i_a]; + count++; + } + ++i_a; + if (i_a == st_a) + goto FINISH_SCALAR; + if (B[i_b + 7] < A[i_a]) { + i_b += 8; + if (i_b == st_b) + goto FINISH_SCALAR; + v_b = _mm_loadu_si128((const __m128i *)&B[i_b]); + } + } + } +FINISH_SCALAR: + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + C[count] = A[i_a]; + 
count++; + i_a++; + i_b++; + } + } + + return count; } static size_t _intersectscalar_vector16(const uint16_t *A, const uint16_t *B, - const size_t s_a, const size_t s_b, uint16_t *C) { - // intersect the tail using scalar intersection - size_t count = 0, i_a = 0, i_b = 0; - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - C[count] = A[i_a]; - count++; - i_a++; - i_b++; - } - } - - return count; + const size_t s_a, const size_t s_b, + uint16_t *C) { + // intersect the tail using scalar intersection + size_t count = 0, i_a = 0, i_b = 0; + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + C[count] = A[i_a]; + count++; + i_a++; + i_b++; + } + } + + return count; } size_t intersect_partitionedV1(const uint16_t *A, const size_t s_a, - const uint16_t *B, const size_t s_b, uint16_t * C) { - size_t i_a = 0, i_b = 0; - size_t counter = 0; - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - do { - i_a += static_cast(A[i_a + 1]) + 2 + 1; - if (i_a >= s_a) - goto end; - } while (A[i_a] < B[i_b]); - } - if (B[i_b] < A[i_a]) { - do { - i_b += static_cast(B[i_b + 1]) + 2 + 1; - if (i_b >= s_b) - goto end; - } while (B[i_b] < A[i_a]); - } else { - size_t partition_size = _intersectV1_vector16(&A[i_a + 2], - &B[i_b + 2], static_cast(A[i_a + 1]) + 1, - static_cast(B[i_b + 1]) + 1, &C[counter + 1]); - C[counter++] = partition_size; // write partition size - counter += partition_size; - i_a += static_cast(A[i_a + 1]) + 2 + 1; - i_b += static_cast(B[i_b + 1]) + 2 + 1; - } - } - end: return counter; + const uint16_t *B, const size_t s_b, + uint16_t *C) { + size_t i_a = 0, i_b = 0; + size_t counter = 0; + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + do { + i_a += static_cast(A[i_a + 1]) + 2 + 1; + if (i_a >= s_a) + goto end; + } while (A[i_a] < B[i_b]); + } + if (B[i_b] < A[i_a]) { + do { + i_b += static_cast(B[i_b + 1]) + 
2 + 1; + if (i_b >= s_b) + goto end; + } while (B[i_b] < A[i_a]); + } else { + size_t partition_size = _intersectV1_vector16( + &A[i_a + 2], &B[i_b + 2], static_cast(A[i_a + 1]) + 1, + static_cast(B[i_b + 1]) + 1, &C[counter + 1]); + C[counter++] = partition_size; // write partition size + counter += partition_size; + i_a += static_cast(A[i_a + 1]) + 2 + 1; + i_b += static_cast(B[i_b + 1]) + 2 + 1; + } + } +end: + return counter; } size_t intersect_partitionedscalar(const uint16_t *A, const size_t s_a, - const uint16_t *B, const size_t s_b, uint16_t * C) { - size_t i_a = 0, i_b = 0; - size_t counter = 0; - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - do { - i_a += static_cast(A[i_a + 1]) + 2 + 1; - if (i_a >= s_a) - goto end; - } while (A[i_a] < B[i_b]); - } - if (B[i_b] < A[i_a]) { - do { - i_b += static_cast(B[i_b + 1]) + 2 + 1; - if (i_b >= s_b) - goto end; - } while (B[i_b] < A[i_a]); - } else { - size_t partition_size = _intersectscalar_vector16(&A[i_a + 2], - &B[i_b + 2], static_cast(A[i_a + 1]) + 1, - static_cast(B[i_b + 1]) + 1, &C[counter + 1]); - C[counter++] = partition_size; // write partition size - counter += partition_size; - i_a += static_cast(A[i_a + 1]) + 2 + 1; - i_b += static_cast(B[i_b + 1]) + 2 + 1; - } - } - end: return counter; + const uint16_t *B, const size_t s_b, + uint16_t *C) { + size_t i_a = 0, i_b = 0; + size_t counter = 0; + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + do { + i_a += static_cast(A[i_a + 1]) + 2 + 1; + if (i_a >= s_a) + goto end; + } while (A[i_a] < B[i_b]); + } + if (B[i_b] < A[i_a]) { + do { + i_b += static_cast(B[i_b + 1]) + 2 + 1; + if (i_b >= s_b) + goto end; + } while (B[i_b] < A[i_a]); + } else { + size_t partition_size = _intersectscalar_vector16( + &A[i_a + 2], &B[i_b + 2], static_cast(A[i_a + 1]) + 1, + static_cast(B[i_b + 1]) + 1, &C[counter + 1]); + C[counter++] = partition_size; // write partition size + counter += partition_size; + i_a += static_cast(A[i_a + 1]) + 2 + 1; + 
i_b += static_cast(B[i_b + 1]) + 2 + 1; + } + } +end: + return counter; } /** * Version optimized by D. Lemire of * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions */ size_t intersect_partitioned(const uint16_t *A, const size_t s_a, - const uint16_t *B, const size_t s_b, uint16_t * C) { - size_t i_a = 0, i_b = 0; - size_t counter = 0; - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - do { - i_a += static_cast(A[i_a + 1]) + 2 + 1; - if (i_a >= s_a) - goto end; - } while (A[i_a] < B[i_b]); - } - if (B[i_b] < A[i_a]) { - do { - i_b += static_cast(B[i_b + 1]) + 2 + 1; - if (i_b >= s_b) - goto end; - } while (B[i_b] < A[i_a]); - } else { - size_t partition_size = _intersect_vector16(&A[i_a + 2], - &B[i_b + 2], static_cast(A[i_a + 1]) + 1, - static_cast(B[i_b + 1]) + 1, &C[counter + 1]); - C[counter++] = partition_size; // write partition size - counter += partition_size; - i_a += static_cast(A[i_a + 1]) + 2 + 1; - i_b += static_cast(B[i_b + 1]) + 2 + 1; - } - } - end: return counter; + const uint16_t *B, const size_t s_b, uint16_t *C) { + size_t i_a = 0, i_b = 0; + size_t counter = 0; + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + do { + i_a += static_cast(A[i_a + 1]) + 2 + 1; + if (i_a >= s_a) + goto end; + } while (A[i_a] < B[i_b]); + } + if (B[i_b] < A[i_a]) { + do { + i_b += static_cast(B[i_b + 1]) + 2 + 1; + if (i_b >= s_b) + goto end; + } while (B[i_b] < A[i_a]); + } else { + size_t partition_size = _intersect_vector16( + &A[i_a + 2], &B[i_b + 2], static_cast(A[i_a + 1]) + 1, + static_cast(B[i_b + 1]) + 1, &C[counter + 1]); + C[counter++] = partition_size; // write partition size + counter += partition_size; + i_a += static_cast(A[i_a + 1]) + 2 + 1; + i_b += static_cast(B[i_b + 1]) + 2 + 1; + } + } +end: + return counter; } size_t original_intersect_partitioned(const uint16_t *A, const size_t s_a, - const uint16_t *B, const size_t s_b, uint16_t * C) { - size_t i_a = 0, i_b = 0; - size_t counter = 0; - while 
(i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - do { - i_a += static_cast(A[i_a + 1]) + 2 + 1; - if (i_a >= s_a) - goto end; - } while (A[i_a] < B[i_b]); - } - if (B[i_b] < A[i_a]) { - do { - i_b += static_cast(B[i_b + 1]) + 2 + 1; - if (i_b >= s_b) - goto end; - } while (B[i_b] < A[i_a]); - } else { - size_t partition_size = _original_intersect_vector16(&A[i_a + 2], - &B[i_b + 2], static_cast(A[i_a + 1]) + 1, - static_cast(B[i_b + 1]) + 1, &C[counter + 1]); - C[counter++] = partition_size; // write partition size - counter += partition_size; - i_a += static_cast(A[i_a + 1]) + 2 + 1; - i_b += static_cast(B[i_b + 1]) + 2 + 1; - } - } - end: return counter; + const uint16_t *B, const size_t s_b, + uint16_t *C) { + size_t i_a = 0, i_b = 0; + size_t counter = 0; + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + do { + i_a += static_cast(A[i_a + 1]) + 2 + 1; + if (i_a >= s_a) + goto end; + } while (A[i_a] < B[i_b]); + } + if (B[i_b] < A[i_a]) { + do { + i_b += static_cast(B[i_b + 1]) + 2 + 1; + if (i_b >= s_b) + goto end; + } while (B[i_b] < A[i_a]); + } else { + size_t partition_size = _original_intersect_vector16( + &A[i_a + 2], &B[i_b + 2], static_cast(A[i_a + 1]) + 1, + static_cast(B[i_b + 1]) + 1, &C[counter + 1]); + C[counter++] = partition_size; // write partition size + counter += partition_size; + i_a += static_cast(A[i_a + 1]) + 2 + 1; + i_b += static_cast(B[i_b + 1]) + 2 + 1; + } + } +end: + return counter; } void printusage() { #ifdef LIKWID_MARKERS - cout << "example: likwid -m -C 1 -g BRANCH ./likwidintersection -u > uniform.out" << endl; + cout << "example: likwid -m -C 1 -g BRANCH ./likwidintersection -u > " + "uniform.out" + << endl; #else - cout << " -u switches to uniform distribution" << endl; + cout << " -u switches to uniform distribution" << endl; #endif } int main(int argc, char **argv) { - size_t howmany = 0; - size_t loop = 3; - bool uniform = false; - uint32_t Big = 22; - float intersectionratio = 0.3f; - uint32_t MaxBit = 
26; - int c; - while ((c = getopt(argc, argv, "uns:m:R:M:S:l:h")) != -1) - switch (c) { - case 'h': - printusage(); - return 0; - case 'S': - Big = atoi(optarg); - break; - case 'R': - intersectionratio = atof(optarg); - break; - case 'M': - MaxBit = atoi(optarg); - if (MaxBit < 1) { - printusage(); - return -1; - } - break; - case 'm': - howmany = atoi(optarg); - if (howmany < 1) { - printusage(); - return -1; - } - break; - case 'l': - loop = atoi(optarg); - if (loop < 1) { - printusage(); - return -1; - } - break; - case 'u': - uniform = true; - break; - default: - abort(); - } - if (howmany == 0) { - howmany = 5; - } - cout << "# howmany : " << howmany << endl; - cout << "# loop : " << loop << endl; - cout << "# distribution : " << (uniform ? "uniform" : "clustered") << endl; - cout << "# Big : " << Big << endl; - cout << "# intersectionratio : " << intersectionratio << endl; - cout << "# MaxBit : " << MaxBit << endl; - UniformDataGenerator udg; - ClusteredDataGenerator cdg; - WallClockTimer z; - size_t bogus = 0; - vector < uint32_t > buffer(2 * (1U << Big)); + size_t howmany = 0; + size_t loop = 3; + bool uniform = false; + uint32_t Big = 22; + float intersectionratio = 0.3f; + uint32_t MaxBit = 26; + int c; + while ((c = getopt(argc, argv, "uns:m:R:M:S:l:h")) != -1) + switch (c) { + case 'h': + printusage(); + return 0; + case 'S': + Big = atoi(optarg); + break; + case 'R': + intersectionratio = atof(optarg); + break; + case 'M': + MaxBit = atoi(optarg); + if (MaxBit < 1) { + printusage(); + return -1; + } + break; + case 'm': + howmany = atoi(optarg); + if (howmany < 1) { + printusage(); + return -1; + } + break; + case 'l': + loop = atoi(optarg); + if (loop < 1) { + printusage(); + return -1; + } + break; + case 'u': + uniform = true; + break; + default: + abort(); + } + if (howmany == 0) { + howmany = 5; + } + cout << "# howmany : " << howmany << endl; + cout << "# loop : " << loop << endl; + cout << "# distribution : " << (uniform ? 
"uniform" : "clustered") << endl; + cout << "# Big : " << Big << endl; + cout << "# intersectionratio : " << intersectionratio << endl; + cout << "# MaxBit : " << MaxBit << endl; + UniformDataGenerator udg; + ClusteredDataGenerator cdg; + WallClockTimer z; + size_t bogus = 0; + vector buffer(2 * (1U << Big)); #ifdef LIKWID_MARKERS - char currentMarker[64]; - likwid_markerInit(); + char currentMarker[64]; + likwid_markerInit(); #endif - cout << "# size-ratio\t"; - for (string intername : IntersectionFactory::allNames()) { - cout << intername << "\t"; - } - cout - << " partioned (Schlegel et al.: improved, original) 16-bitV1 16-bitscalar "; - cout << "relative-intersection-size " << endl; - - for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) { - vector < pair, vector>> data(howmany); - uint32_t smallsize = static_cast(round( - static_cast(1 << Big) / ir)); - cout << "#generating data..."; - cout.flush(); - for (size_t k = 0; k < howmany; ++k) { - data[k] = - uniform ? - getNaivePair(udg, smallsize, 1U << MaxBit, ir, - intersectionratio) : - getNaivePair(cdg, smallsize, 1U << MaxBit, ir, - intersectionratio); - } - cout << "ok." << endl; - cout << "#partitions..."; - vector < pair, vector>> datapart(howmany); - for (size_t k = 0; k < howmany; ++k) { - vector < uint16_t > part1(data[k].first.size() * 4); - size_t p1length = partition(data[k].first.data(), - data[k].first.size(), part1.data(), part1.size()); - part1.resize(p1length); - part1.shrink_to_fit(); - vector < uint16_t > part2(data[k].second.size() * 4); - size_t p2length = partition(data[k].second.data(), - data[k].second.size(), part2.data(), part2.size()); - part2.resize(p2length); - part2.shrink_to_fit(); - datapart[k] = make_pair(part1, part2); - } - cout << "ok." 
<< endl; - - cout << ir << "\t"; - float aratio = 0.0f; - for (string intername : IntersectionFactory::allNames()) { - intersectionfunction interfnc = IntersectionFactory::getFromName( - intername); - size_t volume = 0; + cout << "# size-ratio\t"; + for (string intername : IntersectionFactory::allNames()) { + cout << intername << "\t"; + } + cout << " partioned (Schlegel et al.: improved, original) 16-bitV1 " + "16-bitscalar "; + cout << "relative-intersection-size " << endl; + + for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) { + vector, vector>> data(howmany); + uint32_t smallsize = + static_cast(round(static_cast(1 << Big) / ir)); + cout << "#generating data..."; + cout.flush(); + for (size_t k = 0; k < howmany; ++k) { + data[k] = uniform ? getNaivePair(udg, smallsize, 1U << MaxBit, ir, + intersectionratio) + : getNaivePair(cdg, smallsize, 1U << MaxBit, ir, + intersectionratio); + } + cout << "ok." << endl; + cout << "#partitions..."; + vector, vector>> datapart(howmany); + for (size_t k = 0; k < howmany; ++k) { + vector part1(data[k].first.size() * 4); + size_t p1length = partition(data[k].first.data(), data[k].first.size(), + part1.data(), part1.size()); + part1.resize(p1length); + part1.shrink_to_fit(); + vector part2(data[k].second.size() * 4); + size_t p2length = partition(data[k].second.data(), data[k].second.size(), + part2.data(), part2.size()); + part2.resize(p2length); + part2.shrink_to_fit(); + datapart[k] = make_pair(part1, part2); + } + cout << "ok." 
<< endl; + + cout << ir << "\t"; + float aratio = 0.0f; + for (string intername : IntersectionFactory::allNames()) { + intersectionfunction interfnc = + IntersectionFactory::getFromName(intername); + size_t volume = 0; #ifdef LIKWID_MARKERS - snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", intername.c_str(), ir); - likwid_markerStartRegion(currentMarker); + snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", + intername.c_str(), ir); + likwid_markerStartRegion(currentMarker); #endif - z.reset(); - for (size_t k = 0; k < data.size(); ++k) { - volume += (data[k].first.size() + data[k].second.size()) * loop; - for (size_t L = 0; L < loop; ++L) { - aratio = interfnc(data[k].first.data(), - (data[k].first).size(), data[k].second.data(), - (data[k].second).size(), buffer.data()); - bogus += aratio; - } - } - cout << setw(10) << setprecision(5) - << (volume / (static_cast(z.split()))) << "\t"; + z.reset(); + for (size_t k = 0; k < data.size(); ++k) { + volume += (data[k].first.size() + data[k].second.size()) * loop; + for (size_t L = 0; L < loop; ++L) { + aratio = interfnc(data[k].first.data(), (data[k].first).size(), + data[k].second.data(), (data[k].second).size(), + buffer.data()); + bogus += aratio; + } + } + cout << setw(10) << setprecision(5) + << (volume / (static_cast(z.split()))) << "\t"; #ifdef LIKWID_MARKERS - likwid_markerStopRegion(currentMarker); + likwid_markerStopRegion(currentMarker); #endif - } - z.reset(); - size_t volume = 0; - for (size_t k = 0; k < data.size(); ++k) { - volume += (data[k].first.size() + data[k].second.size()) * loop; - for (size_t L = 0; L < loop; ++L) { - aratio = intersect_partitioned(datapart[k].first.data(), - (datapart[k].first).size(), datapart[k].second.data(), - (datapart[k].second).size(), - (uint16_t *) buffer.data()); - bogus += aratio; - } - } - cout << setw(10) << setprecision(5) - << (volume / (static_cast(z.split()))) << "\t"; - z.reset(); - volume = 0; - for (size_t k = 0; k < data.size(); ++k) { - 
volume += (data[k].first.size() + data[k].second.size()) * loop; - for (size_t L = 0; L < loop; ++L) { - aratio = original_intersect_partitioned( - datapart[k].first.data(), (datapart[k].first).size(), - datapart[k].second.data(), (datapart[k].second).size(), - (uint16_t *) buffer.data()); - bogus += aratio; - } - } - cout << setw(10) << setprecision(5) - << (volume / (static_cast(z.split()))) << "\t"; - z.reset(); - volume = 0; - for (size_t k = 0; k < data.size(); ++k) { - volume += (data[k].first.size() + data[k].second.size()) * loop; - for (size_t L = 0; L < loop; ++L) { - aratio = intersect_partitionedV1(datapart[k].first.data(), - (datapart[k].first).size(), datapart[k].second.data(), - (datapart[k].second).size(), - (uint16_t *) buffer.data()); - bogus += aratio; - } - } - cout << setw(10) << setprecision(5) - << (volume / (static_cast(z.split()))) << "\t"; - z.reset(); - volume = 0; - for (size_t k = 0; k < data.size(); ++k) { - volume += (data[k].first.size() + data[k].second.size()) * loop; - for (size_t L = 0; L < loop; ++L) { - aratio = intersect_partitionedscalar(datapart[k].first.data(), - (datapart[k].first).size(), datapart[k].second.data(), - (datapart[k].second).size(), - (uint16_t *) buffer.data()); - bogus += aratio; - } - } - cout << setw(10) << setprecision(5) - << (volume / (static_cast(z.split()))) << "\t"; - cout << "\t\t" << aratio / smallsize; - cout << endl; - - } + } + z.reset(); + size_t volume = 0; + for (size_t k = 0; k < data.size(); ++k) { + volume += (data[k].first.size() + data[k].second.size()) * loop; + for (size_t L = 0; L < loop; ++L) { + aratio = intersect_partitioned( + datapart[k].first.data(), (datapart[k].first).size(), + datapart[k].second.data(), (datapart[k].second).size(), + (uint16_t *)buffer.data()); + bogus += aratio; + } + } + cout << setw(10) << setprecision(5) + << (volume / (static_cast(z.split()))) << "\t"; + z.reset(); + volume = 0; + for (size_t k = 0; k < data.size(); ++k) { + volume += 
(data[k].first.size() + data[k].second.size()) * loop; + for (size_t L = 0; L < loop; ++L) { + aratio = original_intersect_partitioned( + datapart[k].first.data(), (datapart[k].first).size(), + datapart[k].second.data(), (datapart[k].second).size(), + (uint16_t *)buffer.data()); + bogus += aratio; + } + } + cout << setw(10) << setprecision(5) + << (volume / (static_cast(z.split()))) << "\t"; + z.reset(); + volume = 0; + for (size_t k = 0; k < data.size(); ++k) { + volume += (data[k].first.size() + data[k].second.size()) * loop; + for (size_t L = 0; L < loop; ++L) { + aratio = intersect_partitionedV1( + datapart[k].first.data(), (datapart[k].first).size(), + datapart[k].second.data(), (datapart[k].second).size(), + (uint16_t *)buffer.data()); + bogus += aratio; + } + } + cout << setw(10) << setprecision(5) + << (volume / (static_cast(z.split()))) << "\t"; + z.reset(); + volume = 0; + for (size_t k = 0; k < data.size(); ++k) { + volume += (data[k].first.size() + data[k].second.size()) * loop; + for (size_t L = 0; L < loop; ++L) { + aratio = intersect_partitionedscalar( + datapart[k].first.data(), (datapart[k].first).size(), + datapart[k].second.data(), (datapart[k].second).size(), + (uint16_t *)buffer.data()); + bogus += aratio; + } + } + cout << setw(10) << setprecision(5) + << (volume / (static_cast(z.split()))) << "\t"; + cout << "\t\t" << aratio / smallsize; + cout << endl; + } #ifdef LIKWID_MARKERS - likwid_markerClose(); + likwid_markerClose(); #endif - cout << "# bogus = " << bogus << endl; + cout << "# bogus = " << bogus << endl; } diff --git a/src/benchsearch.cpp b/src/benchsearch.cpp index 773010f..a86cdd0 100644 --- a/src/benchsearch.cpp +++ b/src/benchsearch.cpp @@ -6,435 +6,467 @@ #include "codecfactory.h" #ifdef _MSC_VER -# include +#include __int64 freq; typedef __int64 time_snap_t; -static time_snap_t time_snap(void) -{ - __int64 now; +static time_snap_t time_snap(void) { + __int64 now; - QueryPerformanceCounter((LARGE_INTEGER *)&now); + 
QueryPerformanceCounter((LARGE_INTEGER *)&now); - return (__int64)((now*1000000)/freq); + return (__int64)((now * 1000000) / freq); } -# define TIME_SNAP_FMT "%I64d" +#define TIME_SNAP_FMT "%I64d" #else -# define time_snap clock -# define TIME_SNAP_FMT "%lu" +#define time_snap clock +#define TIME_SNAP_FMT "%lu" typedef clock_t time_snap_t; #endif -int uint32_cmp(const void *a, const void *b) -{ - const uint32_t *ia = (const uint32_t *)a; - const uint32_t *ib = (const uint32_t *)b; - if(*ia < *ib) - return -1; - else if (*ia > *ib) - return 1; - return 0; +int uint32_cmp(const void *a, const void *b) { + const uint32_t *ia = (const uint32_t *)a; + const uint32_t *ib = (const uint32_t *)b; + if (*ia < *ib) + return -1; + else if (*ia > *ib) + return 1; + return 0; } +template int benchmarkSelect() { + T codec; + static const size_t N = 256; + uint32_t buffer[N]; + uint32_t backbuffer[N]; + uint32_t b; + time_snap_t S1, S2, S3; + size_t i; + printf("# benchmarking select %s \n", codec.name().c_str()); + srand(0); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t out[N * 2]; + uint32_t prev = 0; + /* initialize the buffer */ + for (i = 0; i < N; i++) { + buffer[i] = ((uint32_t)rand()); + if (b < 32) + buffer[i] %= (1 << b); + } + for (i = 0; i < N; i++) { + buffer[i] = buffer[i] + prev; + prev = buffer[i]; + } + qsort(buffer, N, sizeof(uint32_t), uint32_cmp); -template -int benchmarkSelect() { - T codec; - static const size_t N = 256; - uint32_t buffer[N]; - uint32_t backbuffer[N]; - uint32_t b; - time_snap_t S1, S2, S3; - size_t i; - printf("# benchmarking select %s \n",codec.name().c_str()); - srand(0); - - /* this test creates delta encoded buffers with different bits, then - * performs lower bound searches for each key */ - for (b = 0; b <= 32; b++) { - uint32_t out[N * 2]; - uint32_t prev = 0; - /* initialize the buffer */ - for (i = 0; i < N; i++) { - 
buffer[i] = ((uint32_t)rand()) ; - if(b < 32) buffer[i] %= (1< key) { - imax = imid; - } else if (A[imid] < key) { - imin = imid; - } else { - return imid; - } +int binary_search(uint32_t *A, uint32_t key, int imin, int imax) { + int imid; + imax--; + while (imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] > key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } else { + return imid; } - return imax; + } + return imax; } - /* adapted from wikipedia */ -int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) -{ - int imid; - imax --; - while(imin + 1 < imax) { - imid = imin + ((imax - imin) / 2); - - if (A[imid] >= key) { - imax = imid; - } else if (A[imid] < key) { - imin = imid; - } +int lower_bound(uint32_t *A, uint32_t key, int imin, int imax) { + int imid; + imax--; + while (imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; } - if(A[imin] >= key) return imin; - return imax; + } + if (A[imin] >= key) + return imin; + return imax; } +template int benchmarkSearch() { + T codec; + uint32_t buffer[N]; + uint32_t backbuffer[N]; + uint32_t out[N * 2]; + uint32_t result, initial = 0; + uint32_t b, i; + time_snap_t S1, S2, S3; + printf("# benchmarking search %s N=%lu \n", codec.name().c_str(), N); + srand(0); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + /* initialize the buffer */ + for (i = 0; i < N; i++) { + buffer[i] = ((uint32_t)rand()); + if (b < 32) + buffer[i] %= (1 << b); + } + for (i = 0; i < N; i++) { + buffer[i] = buffer[i] + prev; + prev = buffer[i]; + } -template -int benchmarkSearch() { - T codec; - uint32_t buffer[N]; - uint32_t backbuffer[N]; - uint32_t out[N * 2]; - uint32_t result, initial = 0; - uint32_t b, i; - time_snap_t S1, S2, S3; - printf("# benchmarking search %s N=%lu 
\n",codec.name().c_str(),N); - srand(0); - - /* this test creates delta encoded buffers with different bits, then - * performs lower bound searches for each key */ - for (b = 0; b <= 32; b++) { - uint32_t prev = initial; - /* initialize the buffer */ - for (i = 0; i < N; i++) { - buffer[i] = ((uint32_t)rand()) ; - if(b < 32) buffer[i] %= (1< 0) { - if(buffer[pos-1] >= pseudorandomkey) { - printf("bug B.\n"); - return -1; - } - } + for (i = 0; i < N; i++) { + out[i] = 0; /* memset would do too */ + } + size_t nvalue = N * 2; + + codec.encodeArray(buffer, N, out, nvalue); + { + size_t recovlength = N; + codec.decodeArray(out, nvalue, backbuffer, recovlength); + assert(recovlength == N); + for (size_t k = 0; k < N; k++) { + assert(backbuffer[k] == buffer[k]); + } + } + S1 = time_snap(); + for (i = 0; i < N * 10; i++) { + uint32_t pseudorandomkey = buffer[i % N]; + uint32_t result = 0; + size_t pos = codec.findLowerBound(out, nvalue, pseudorandomkey, &result); + if ((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug A: pseudorandomkey = %u result = %u buffer[%zu] = %u .\n", + pseudorandomkey, result, pos, buffer[pos]); + return -1; + } else if (pos > 0) { + if (buffer[pos - 1] >= pseudorandomkey) { + printf("bug B.\n"); + return -1; } - S2 = time_snap(); - for (i = 0; i < N * 10; i++) { - int pos; - uint32_t pseudorandomkey = buffer[i%N]; - size_t recovlength = N; - codec.decodeArray(out,nvalue,backbuffer,recovlength); - pos = lower_bound(backbuffer, pseudorandomkey, 0, N); - result = backbuffer[pos]; - - if((result < pseudorandomkey) || (buffer[pos] != result)) { - printf("bug C.\n"); - return -1; - } else if (pos > 0) { - if(buffer[pos-1] >= pseudorandomkey) { - printf("bug D.\n"); - return -1; - } - } + } + } + S2 = time_snap(); + for (i = 0; i < N * 10; i++) { + int pos; + uint32_t pseudorandomkey = buffer[i % N]; + size_t recovlength = N; + codec.decodeArray(out, nvalue, backbuffer, recovlength); + pos = lower_bound(backbuffer, pseudorandomkey, 
0, N); + result = backbuffer[pos]; + + if ((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug C.\n"); + return -1; + } else if (pos > 0) { + if (buffer[pos - 1] >= pseudorandomkey) { + printf("bug D.\n"); + return -1; } - S3 = time_snap(); - printf("# bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT ", following line is in millions of operation per second \n", b, (S2-S1), (S3-S2) ); - printf("%d %f %f %f \n",b, N * 10.0 / (S2 - S1), N * 10.0 / (S3 - S2),nvalue*32.0/N); + } } - printf("\n\n"); - return 0; + S3 = time_snap(); + printf("# bit width = %d, fast search function time = " TIME_SNAP_FMT + ", naive time = " TIME_SNAP_FMT + ", following line is in millions of operation per second \n", + b, (S2 - S1), (S3 - S2)); + printf("%d %f %f %f \n", b, N * 10.0 / (S2 - S1), N * 10.0 / (S3 - S2), + nvalue * 32.0 / N); + } + printf("\n\n"); + return 0; } +template int benchmarkInsert() { + T codec; + const int max = 256; + srand(0); + time_snap_t S1, S2, S3; + printf("# benchmarking insert %s \n", codec.name().c_str()); + // encode in a buffer + std::vector compressedbuffer1(max * sizeof(uint32_t) + 1024); + std::vector compressedbuffer2(max * sizeof(uint32_t) + 1024); + + std::vector buffer(max); + std::vector backbuffer(max); + + for (int b = 0; b <= 32; b++) { + compressedbuffer1.resize(compressedbuffer1.capacity()); + compressedbuffer1[0] = 0; // required for streamvbyte + compressedbuffer1[1] = 0; + compressedbuffer2[0] = 0; + compressedbuffer2[1] = 0; + compressedbuffer2.resize(compressedbuffer2.capacity()); + + /* initialize the buffer */ + for (int i = 0; i < max; i++) { + buffer[i] = ((uint32_t)rand()); + if (b < 32) + buffer[i] %= (1 << b); + } + S1 = time_snap(); -template -int benchmarkInsert() { - T codec; - const int max = 256; - srand(0); - time_snap_t S1, S2, S3; - printf("# benchmarking insert %s \n",codec.name().c_str()); - // encode in a buffer - std::vector < uint32_t > 
compressedbuffer1(max * sizeof(uint32_t) + 1024); - std::vector < uint32_t > compressedbuffer2(max * sizeof(uint32_t) + 1024); - - std::vector < uint32_t > buffer(max); - std::vector < uint32_t > backbuffer(max); - - for (int b = 0; b <= 32; b++) { - compressedbuffer1.resize(compressedbuffer1.capacity()); - compressedbuffer1[0] = 0; // required for streamvbyte - compressedbuffer1[1] = 0; - compressedbuffer2[0] = 0; - compressedbuffer2[1] = 0; - compressedbuffer2.resize(compressedbuffer2.capacity()); - - /* initialize the buffer */ - for (int i = 0; i < max; i++) { - buffer[i] = ((uint32_t) rand()); - if (b < 32) - buffer[i] %= (1 << b); - } - S1 = time_snap(); - - size_t currentsize1 = 0; - for (int i = 0; i < max; i++) { - currentsize1 = codec.insert(compressedbuffer1.data(), currentsize1, - buffer[i]); - } - S2 = time_snap(); - - size_t currentsize2 = 0; - for (int i = 0; i < max; i++) { - size_t recovlength = backbuffer.size(); - codec.decodeArray(compressedbuffer2.data(),currentsize2,backbuffer.data(),recovlength); - auto it = lower_bound(backbuffer.begin(), backbuffer.begin() + recovlength, buffer[i]); - backbuffer.insert(it, buffer[i]); - currentsize2 = compressedbuffer2.size(); - codec.encodeArray(backbuffer.data(), recovlength+1,compressedbuffer2.data(),currentsize2); - } - S3 = time_snap(); - - printf("# bit width = %d, fast insert function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2) ); - printf("%d " TIME_SNAP_FMT " " TIME_SNAP_FMT " \n",b, (S2-S1), (S3-S2)); - if(currentsize1 != currentsize2) { - printf("bug A: %u != %u\n", (uint32_t)currentsize1, - (uint32_t)currentsize2); - return -1; - } - compressedbuffer1.resize(currentsize1); - compressedbuffer2.resize(currentsize2); - if(compressedbuffer1 != compressedbuffer2) { - printf("bug B\n"); - return -1; - } - - } - - printf("\n\n"); - return 0; + size_t currentsize1 = 0; + for (int i = 0; i < max; i++) { + currentsize1 = + codec.insert(compressedbuffer1.data(), 
currentsize1, buffer[i]); + } + S2 = time_snap(); + + size_t currentsize2 = 0; + for (int i = 0; i < max; i++) { + size_t recovlength = backbuffer.size(); + codec.decodeArray(compressedbuffer2.data(), currentsize2, + backbuffer.data(), recovlength); + auto it = lower_bound(backbuffer.begin(), + backbuffer.begin() + recovlength, buffer[i]); + backbuffer.insert(it, buffer[i]); + currentsize2 = compressedbuffer2.size(); + codec.encodeArray(backbuffer.data(), recovlength + 1, + compressedbuffer2.data(), currentsize2); + } + S3 = time_snap(); + + printf("# bit width = %d, fast insert function time = " TIME_SNAP_FMT + ", naive time = " TIME_SNAP_FMT " \n", + b, (S2 - S1), (S3 - S2)); + printf("%d " TIME_SNAP_FMT " " TIME_SNAP_FMT " \n", b, (S2 - S1), + (S3 - S2)); + if (currentsize1 != currentsize2) { + printf("bug A: %u != %u\n", (uint32_t)currentsize1, + (uint32_t)currentsize2); + return -1; + } + compressedbuffer1.resize(currentsize1); + compressedbuffer2.resize(currentsize2); + if (compressedbuffer1 != compressedbuffer2) { + printf("bug B\n"); + return -1; + } + } + printf("\n\n"); + return 0; } // same as testAppend<>, but uses encodeByteArray/decodeByteArray // to avoid padding -template -int benchmarkAppendToByteArray() { - T codec; - const int max = 256; - srand(0); - time_snap_t S1, S2, S3; - printf("# benchmarking insert %s \n",codec.name().c_str()); - // encode in a buffer - std::vector < uint32_t > compressedbuffer1(max * sizeof(uint32_t) + 1024); - std::vector < uint32_t > compressedbuffer2(max * sizeof(uint32_t) + 1024); - - std::vector < uint32_t > buffer(max); - std::vector < uint32_t > backbuffer(max); - - for (int b = 0; b <= 32; b++) { - compressedbuffer1.resize(compressedbuffer1.capacity()); - compressedbuffer1[0] = 0; // required for streamvbyte - compressedbuffer1[1] = 0; - compressedbuffer2[0] = 0; - compressedbuffer2[1] = 0; - compressedbuffer2.resize(compressedbuffer2.capacity()); - - /* initialize the buffer */ - for (int i = 0; i < max; i++) { 
- buffer[i] = ((uint32_t) rand()); - if (b < 32) - buffer[i] %= (1 << b); - } - S1 = time_snap(); - - size_t currentsize1 = 0; - for (int i = 0; i < max; i++) { - currentsize1 = codec.appendToByteArray((uint8_t *)compressedbuffer1.data(), currentsize1, - i == 0 ? 0 : buffer[i-1],buffer[i]); - } - S2 = time_snap(); - - size_t currentsize2 = 0; - for (int i = 0; i < max; i++) { - size_t recovlength = backbuffer.size(); - codec.decodeArray(compressedbuffer2.data(),currentsize2,backbuffer.data(),recovlength); - backbuffer[recovlength] = buffer[i]; - currentsize2 = compressedbuffer2.size(); - codec.encodeArray(backbuffer.data(), recovlength+1,compressedbuffer2.data(),currentsize2); - } - S3 = time_snap(); - - printf("# bit width = %d, fast append function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2) ); - printf("%d " TIME_SNAP_FMT " " TIME_SNAP_FMT " \n",b, (S2-S1), (S3-S2)); - } - - printf("\n\n"); - return 0; -} +template int benchmarkAppendToByteArray() { + T codec; + const int max = 256; + srand(0); + time_snap_t S1, S2, S3; + printf("# benchmarking insert %s \n", codec.name().c_str()); + // encode in a buffer + std::vector compressedbuffer1(max * sizeof(uint32_t) + 1024); + std::vector compressedbuffer2(max * sizeof(uint32_t) + 1024); + + std::vector buffer(max); + std::vector backbuffer(max); + + for (int b = 0; b <= 32; b++) { + compressedbuffer1.resize(compressedbuffer1.capacity()); + compressedbuffer1[0] = 0; // required for streamvbyte + compressedbuffer1[1] = 0; + compressedbuffer2[0] = 0; + compressedbuffer2[1] = 0; + compressedbuffer2.resize(compressedbuffer2.capacity()); + + /* initialize the buffer */ + for (int i = 0; i < max; i++) { + buffer[i] = ((uint32_t)rand()); + if (b < 32) + buffer[i] %= (1 << b); + } + S1 = time_snap(); + size_t currentsize1 = 0; + for (int i = 0; i < max; i++) { + currentsize1 = codec.appendToByteArray( + (uint8_t *)compressedbuffer1.data(), currentsize1, + i == 0 ? 
0 : buffer[i - 1], buffer[i]); + } + S2 = time_snap(); + + size_t currentsize2 = 0; + for (int i = 0; i < max; i++) { + size_t recovlength = backbuffer.size(); + codec.decodeArray(compressedbuffer2.data(), currentsize2, + backbuffer.data(), recovlength); + backbuffer[recovlength] = buffer[i]; + currentsize2 = compressedbuffer2.size(); + codec.encodeArray(backbuffer.data(), recovlength + 1, + compressedbuffer2.data(), currentsize2); + } + S3 = time_snap(); + printf("# bit width = %d, fast append function time = " TIME_SNAP_FMT + ", naive time = " TIME_SNAP_FMT " \n", + b, (S2 - S1), (S3 - S2)); + printf("%d " TIME_SNAP_FMT " " TIME_SNAP_FMT " \n", b, (S2 - S1), + (S3 - S2)); + } + + printf("\n\n"); + return 0; +} using namespace SIMDCompressionLib; int main() { - int r = 0; + int r = 0; #ifdef _MSC_VER - QueryPerformanceFrequency((LARGE_INTEGER *)&freq); + QueryPerformanceFrequency((LARGE_INTEGER *)&freq); #endif - r = benchmarkInsert(); - if(r < 0) return r; - r = benchmarkInsert>(); - if(r < 0) return r; - r = benchmarkInsert>(); - if(r < 0) return r; - r = benchmarkInsert>(); - if(r < 0) return r; - - r = benchmarkSearch(); - if(r < 0) return r; - r = benchmarkSearch(); - if(r < 0) return r; - r = benchmarkSearch(); - if(r < 0) return r; - r = benchmarkSearch>(); - if(r < 0) return r; - r = benchmarkSearch>(); - if(r < 0) return r; - r = benchmarkSearch>(); - if(r < 0) return r; - r = benchmarkSearch>(); - if(r < 0) return r; - r = benchmarkSearch>>(); - if(r < 0) return r; - r = benchmarkSearch(); - if(r < 0) return r; - - r = benchmarkSelect(); - if(r < 0) return r; - r = benchmarkSelect(); - if(r < 0) return r; - r = benchmarkSelect(); - if(r < 0) return r; - r = benchmarkSelect>(); - if(r < 0) return r; - r = benchmarkSelect>(); - if(r < 0) return r; - r = benchmarkSelect>(); - if(r < 0) return r; - r = benchmarkSelect>(); - if(r < 0) return r; - r = benchmarkSelect>>(); - if(r < 0) return r; - r = benchmarkSelect(); - if(r < 0) return r; - - - r = 
benchmarkAppendToByteArray(); - if(r < 0) return r; - r = benchmarkAppendToByteArray(); - if(r < 0) return r; - r = benchmarkAppendToByteArray(); - if(r < 0) return r; - r = benchmarkAppendToByteArray>(); - if(r < 0) return r; - r = benchmarkAppendToByteArray>(); - if(r < 0) return r; - r = benchmarkAppendToByteArray>(); - if(r < 0) return r; - r = benchmarkAppendToByteArray>(); - if(r < 0) return r; - r = benchmarkAppendToByteArray(); - if(r < 0) return r; - - - - r = benchmarkSearch,128>(); - if(r < 0) return r; - return 0; + r = benchmarkInsert(); + if (r < 0) + return r; + r = benchmarkInsert>(); + if (r < 0) + return r; + r = benchmarkInsert>(); + if (r < 0) + return r; + r = benchmarkInsert>(); + if (r < 0) + return r; + + r = benchmarkSearch(); + if (r < 0) + return r; + r = benchmarkSearch(); + if (r < 0) + return r; + r = benchmarkSearch(); + if (r < 0) + return r; + r = benchmarkSearch>(); + if (r < 0) + return r; + r = benchmarkSearch>(); + if (r < 0) + return r; + r = benchmarkSearch>(); + if (r < 0) + return r; + r = benchmarkSearch>(); + if (r < 0) + return r; + r = benchmarkSearch< + SIMDBinaryPacking>>(); + if (r < 0) + return r; + r = benchmarkSearch(); + if (r < 0) + return r; + + r = benchmarkSelect(); + if (r < 0) + return r; + r = benchmarkSelect(); + if (r < 0) + return r; + r = benchmarkSelect(); + if (r < 0) + return r; + r = benchmarkSelect>(); + if (r < 0) + return r; + r = benchmarkSelect>(); + if (r < 0) + return r; + r = benchmarkSelect>(); + if (r < 0) + return r; + r = benchmarkSelect>(); + if (r < 0) + return r; + r = benchmarkSelect< + SIMDBinaryPacking>>(); + if (r < 0) + return r; + r = benchmarkSelect(); + if (r < 0) + return r; + + r = benchmarkAppendToByteArray(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray>(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray>(); + if (r < 0) + return 
r; + r = benchmarkAppendToByteArray>(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray>(); + if (r < 0) + return r; + r = benchmarkAppendToByteArray(); + if (r < 0) + return r; + + r = benchmarkSearch, 128>(); + if (r < 0) + return r; + return 0; } - - - - diff --git a/src/bitpacking.cpp b/src/bitpacking.cpp index eba01a8..5d83154 100644 --- a/src/bitpacking.cpp +++ b/src/bitpacking.cpp @@ -4,9229 +4,9038 @@ * */ - #include "bitpacking.h" namespace SIMDCompressionLib { +void __fastunpack0(const uint32_t *__restrict__, uint32_t *__restrict__ out) { + for (uint32_t i = 0; i < 32; ++i) + *(out++) = 0; +} +void __fastpack0(const uint32_t *__restrict__, uint32_t *__restrict__) {} +void __fastpackwithoutmask0(const uint32_t *__restrict__, + uint32_t *__restrict__) {} + +void __fastunpack32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} +void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} -void __fastunpack0(const uint32_t *__restrict__ , uint32_t *__restrict__ out) { - for (uint32_t i = 0; i < 32; ++i) - *(out++) = 0; +void __fastpackwithoutmask32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; } -void __fastpack0(const uint32_t *__restrict__ , uint32_t *__restrict__) { + +void __fastunpack1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; + out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + *out = ((*in) >> 8) & 1; + out++; + *out = ((*in) >> 9) & 1; + out++; + *out = ((*in) >> 10) & 1; + out++; + *out = ((*in) >> 11) & 1; + out++; + *out = ((*in) >> 12) & 1; + out++; + *out = ((*in) >> 13) & 1; + 
out++; + *out = ((*in) >> 14) & 1; + out++; + *out = ((*in) >> 15) & 1; + out++; + *out = ((*in) >> 16) & 1; + out++; + *out = ((*in) >> 17) & 1; + out++; + *out = ((*in) >> 18) & 1; + out++; + *out = ((*in) >> 19) & 1; + out++; + *out = ((*in) >> 20) & 1; + out++; + *out = ((*in) >> 21) & 1; + out++; + *out = ((*in) >> 22) & 1; + out++; + *out = ((*in) >> 23) & 1; + out++; + *out = ((*in) >> 24) & 1; + out++; + *out = ((*in) >> 25) & 1; + out++; + *out = ((*in) >> 26) & 1; + out++; + *out = ((*in) >> 27) & 1; + out++; + *out = ((*in) >> 28) & 1; + out++; + *out = ((*in) >> 29) & 1; + out++; + *out = ((*in) >> 30) & 1; + out++; + *out = ((*in) >> 31); } -void __fastpackwithoutmask0(const uint32_t *__restrict__ , uint32_t *__restrict__) { + +void __fastunpack2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = 
((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); } +void __fastunpack3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = ((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + *out = ((*in) >> 24) % (1U << 3); + out++; + *out = ((*in) >> 27) % (1U << 3); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + out++; + *out = ((*in) >> 1) % (1U << 3); + out++; + *out = ((*in) >> 4) % (1U << 3); + out++; + *out = ((*in) >> 7) % (1U << 3); + out++; + *out = ((*in) >> 10) % (1U << 3); + out++; + *out = ((*in) >> 13) % (1U << 3); + out++; + *out = ((*in) >> 16) % (1U << 3); + out++; + *out = ((*in) >> 19) % (1U << 3); + out++; + *out = ((*in) >> 22) % (1U << 3); + out++; + *out = ((*in) >> 25) % (1U << 3); + out++; + *out = ((*in) >> 28) % (1U << 3); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 2)) << (3 - 2); + out++; + *out = ((*in) >> 2) % (1U << 3); + out++; + *out = ((*in) >> 5) % (1U << 3); + out++; + *out = ((*in) >> 8) % (1U << 3); + out++; + *out = ((*in) >> 11) % (1U << 3); + out++; + *out = ((*in) >> 14) % (1U << 3); + out++; + *out = ((*in) >> 17) % (1U << 3); + out++; + *out = ((*in) >> 20) % (1U << 3); + out++; + *out = ((*in) >> 23) % (1U << 3); + out++; + *out = ((*in) >> 26) % (1U << 3); + out++; + *out = ((*in) >> 29); +} +void __fastunpack5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = 
((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + *out = ((*in) >> 8) % (1U << 5); + out++; + *out = ((*in) >> 13) % (1U << 5); + out++; + *out = ((*in) >> 18) % (1U << 5); + out++; + *out = ((*in) >> 23) % (1U << 5); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + out++; + *out = ((*in) >> 1) % (1U << 5); + out++; + *out = ((*in) >> 6) % (1U << 5); + out++; + *out = ((*in) >> 11) % (1U << 5); + out++; + *out = ((*in) >> 16) % (1U << 5); + out++; + *out = ((*in) >> 21) % (1U << 5); + out++; + *out = ((*in) >> 26) % (1U << 5); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 4)) << (5 - 4); + out++; + *out = ((*in) >> 4) % (1U << 5); + out++; + *out = ((*in) >> 9) % (1U << 5); + out++; + *out = ((*in) >> 14) % (1U << 5); + out++; + *out = ((*in) >> 19) % (1U << 5); + out++; + *out = ((*in) >> 24) % (1U << 5); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 2)) << (5 - 2); + out++; + *out = ((*in) >> 2) % (1U << 5); + out++; + *out = ((*in) >> 7) % (1U << 5); + out++; + *out = ((*in) >> 12) % (1U << 5); + out++; + *out = ((*in) >> 17) % (1U << 5); + out++; + *out = ((*in) >> 22) % (1U << 5); + out++; + *out = ((*in) >> 27); +} -void __fastunpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (int k = 0 ; k < 32 ; ++k) - out[k] = in[k]; +void __fastunpack6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out 
= ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); } -void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (int k = 0 ; k < 32 ; ++k) - out[k] = in[k]; + +void __fastunpack7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + *out = ((*in) >> 24) % (1U << 7); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + out++; + *out = 
((*in) >> 6) % (1U << 7); + out++; + *out = ((*in) >> 13) % (1U << 7); + out++; + *out = ((*in) >> 20) % (1U << 7); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + out++; + *out = ((*in) >> 2) % (1U << 7); + out++; + *out = ((*in) >> 9) % (1U << 7); + out++; + *out = ((*in) >> 16) % (1U << 7); + out++; + *out = ((*in) >> 23) % (1U << 7); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 5)) << (7 - 5); + out++; + *out = ((*in) >> 5) % (1U << 7); + out++; + *out = ((*in) >> 12) % (1U << 7); + out++; + *out = ((*in) >> 19) % (1U << 7); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 1)) << (7 - 1); + out++; + *out = ((*in) >> 1) % (1U << 7); + out++; + *out = ((*in) >> 8) % (1U << 7); + out++; + *out = ((*in) >> 15) % (1U << 7); + out++; + *out = ((*in) >> 22) % (1U << 7); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 4)) << (7 - 4); + out++; + *out = ((*in) >> 4) % (1U << 7); + out++; + *out = ((*in) >> 11) % (1U << 7); + out++; + *out = ((*in) >> 18) % (1U << 7); + out++; + *out = ((*in) >> 25); } -void __fastpackwithoutmask32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (int k = 0 ; k < 32 ; ++k) - out[k] = in[k]; +void __fastunpack9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + *out = ((*in) >> 8) % (1U << 9); + out++; + *out = ((*in) >> 17) % (1U << 9); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + out++; + *out = ((*in) >> 3) % (1U << 9); + out++; + *out = 
((*in) >> 12) % (1U << 9); + out++; + *out = ((*in) >> 21) % (1U << 9); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + out++; + *out = ((*in) >> 7) % (1U << 9); + out++; + *out = ((*in) >> 16) % (1U << 9); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 2)) << (9 - 2); + out++; + *out = ((*in) >> 2) % (1U << 9); + out++; + *out = ((*in) >> 11) % (1U << 9); + out++; + *out = ((*in) >> 20) % (1U << 9); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 6)) << (9 - 6); + out++; + *out = ((*in) >> 6) % (1U << 9); + out++; + *out = ((*in) >> 15) % (1U << 9); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 1)) << (9 - 1); + out++; + *out = ((*in) >> 1) % (1U << 9); + out++; + *out = ((*in) >> 10) % (1U << 9); + out++; + *out = ((*in) >> 19) % (1U << 9); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 5)) << (9 - 5); + out++; + *out = ((*in) >> 5) % (1U << 9); + out++; + *out = ((*in) >> 14) % (1U << 9); + out++; + *out = ((*in) >> 23); } +void __fastunpack10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 
22); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); +} +void __fastunpack11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + out++; + *out = ((*in) >> 3) % (1U << 11); + out++; + *out = ((*in) >> 14) % (1U << 11); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + out++; + *out = ((*in) >> 4) % (1U << 11); + out++; + *out = ((*in) >> 15) % (1U << 11); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 5)) << (11 - 5); + out++; + *out = ((*in) >> 5) % (1U << 11); + out++; + *out = ((*in) >> 16) % (1U << 11); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 6)) 
<< (11 - 6); + out++; + *out = ((*in) >> 6) % (1U << 11); + out++; + *out = ((*in) >> 17) % (1U << 11); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 7)) << (11 - 7); + out++; + *out = ((*in) >> 7) % (1U << 11); + out++; + *out = ((*in) >> 18) % (1U << 11); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 8)) << (11 - 8); + out++; + *out = ((*in) >> 8) % (1U << 11); + out++; + *out = ((*in) >> 19) % (1U << 11); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 9)) << (11 - 9); + out++; + *out = ((*in) >> 9) % (1U << 11); + out++; + *out = ((*in) >> 20) % (1U << 11); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 10)) << (11 - 10); + out++; + *out = ((*in) >> 10) % (1U << 11); + out++; + *out = ((*in) >> 21); +} -void __fastunpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) & 1 ; - out++; - *out = ((*in) >> 1) & 1 ; - out++; - *out = ((*in) >> 2) & 1 ; - out++; - *out = ((*in) >> 3) & 1 ; - out++; - *out = ((*in) >> 4) & 1 ; - out++; - *out = ((*in) >> 5) & 1 ; - out++; - *out = ((*in) >> 6) & 1 ; - out++; - *out = ((*in) >> 7) & 1 ; - out++; - *out = ((*in) >> 8) & 1 ; - out++; - *out = ((*in) >> 9) & 1 ; - out++; - *out = ((*in) >> 10) & 1 ; - out++; - *out = ((*in) >> 11) & 1 ; - out++; - *out = ((*in) >> 12) & 1 ; - out++; - *out = ((*in) >> 13) & 1 ; - out++; - *out = ((*in) >> 14) & 1 ; - out++; - *out = ((*in) >> 15) & 1 ; - out++; - *out = ((*in) >> 16) & 1 ; - out++; - *out = ((*in) >> 17) & 1 ; - out++; - *out = ((*in) >> 18) & 1 ; - out++; - *out = ((*in) >> 19) & 1 ; - out++; - *out = ((*in) >> 20) & 1 ; - out++; - *out = ((*in) >> 21) & 1 ; - out++; - *out = ((*in) >> 22) & 1 ; - out++; - *out = ((*in) >> 23) & 1 ; - out++; - *out = ((*in) >> 24) & 1 ; - out++; - *out = ((*in) >> 25) & 1 ; - out++; - *out = ((*in) >> 26) & 1 ; - out++; - *out = ((*in) >> 27) & 1 ; - out++; - *out = ((*in) >> 28) & 1 ; - out++; - *out = ((*in) >> 29) & 1 
; - out++; - *out = ((*in) >> 30) & 1 ; - out++; - *out = ((*in) >> 31) ; +void __fastunpack12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); } +void __fastunpack13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % 
(1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + *out = ((*in) >> 8) % (1U << 13); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + out++; + *out = ((*in) >> 2) % (1U << 13); + out++; + *out = ((*in) >> 15) % (1U << 13); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + out++; + *out = ((*in) >> 9) % (1U << 13); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) << (13 - 3); + out++; + *out = ((*in) >> 3) % (1U << 13); + out++; + *out = ((*in) >> 16) % (1U << 13); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 10)) << (13 - 10); + out++; + *out = ((*in) >> 10) % (1U << 13); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 4)) << (13 - 4); + out++; + *out = ((*in) >> 4) % (1U << 13); + out++; + *out = ((*in) >> 17) % (1U << 13); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 11)) << (13 - 11); + out++; + *out = ((*in) >> 11) % (1U << 13); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 5)) << (13 - 5); + out++; + *out = ((*in) >> 5) % (1U << 13); + out++; + *out = ((*in) >> 18) % (1U << 13); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 12)) << (13 - 12); + out++; + *out = ((*in) >> 12) % (1U << 13); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 6)) << (13 - 6); + out++; + *out = ((*in) >> 6) % (1U << 13); + out++; + *out = ((*in) >> 19); +} +void __fastunpack14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 
14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); +} +void __fastunpack15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 15); + 
out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + out++; + *out = ((*in) >> 7) % (1U << 15); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + out++; + *out = ((*in) >> 5) % (1U << 15); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + out++; + *out = ((*in) >> 3) % (1U << 15); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + out++; + *out = ((*in) >> 1) % (1U << 15); + out++; + *out = ((*in) >> 16) % (1U << 15); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 14)) << (15 - 14); + out++; + *out = ((*in) >> 14) % (1U << 15); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 12)) << (15 - 12); + out++; + *out = ((*in) >> 12) % (1U << 15); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 10)) << (15 - 10); + out++; + *out = ((*in) >> 10) % (1U << 15); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 8)) << (15 - 8); + out++; + *out = ((*in) >> 8) % (1U << 15); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 6)) << (15 - 6); + out++; + *out = ((*in) >> 6) % (1U << 15); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 4)) << (15 - 4); + out++; + *out = ((*in) >> 4) % (1U << 15); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 2)) << (15 - 2); + out++; + *out = ((*in) >> 2) % (1U << 15); + out++; + *out = ((*in) >> 17); +} -void __fastunpack2(const uint32_t *__restrict__ in, 
uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 2) ; - out++; - *out = ((*in) >> 2) % (1U << 2) ; - out++; - *out = ((*in) >> 4) % (1U << 2) ; - out++; - *out = ((*in) >> 6) % (1U << 2) ; - out++; - *out = ((*in) >> 8) % (1U << 2) ; - out++; - *out = ((*in) >> 10) % (1U << 2) ; - out++; - *out = ((*in) >> 12) % (1U << 2) ; - out++; - *out = ((*in) >> 14) % (1U << 2) ; - out++; - *out = ((*in) >> 16) % (1U << 2) ; - out++; - *out = ((*in) >> 18) % (1U << 2) ; - out++; - *out = ((*in) >> 20) % (1U << 2) ; - out++; - *out = ((*in) >> 22) % (1U << 2) ; - out++; - *out = ((*in) >> 24) % (1U << 2) ; - out++; - *out = ((*in) >> 26) % (1U << 2) ; - out++; - *out = ((*in) >> 28) % (1U << 2) ; - out++; - *out = ((*in) >> 30) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 2) ; - out++; - *out = ((*in) >> 2) % (1U << 2) ; - out++; - *out = ((*in) >> 4) % (1U << 2) ; - out++; - *out = ((*in) >> 6) % (1U << 2) ; - out++; - *out = ((*in) >> 8) % (1U << 2) ; - out++; - *out = ((*in) >> 10) % (1U << 2) ; - out++; - *out = ((*in) >> 12) % (1U << 2) ; - out++; - *out = ((*in) >> 14) % (1U << 2) ; - out++; - *out = ((*in) >> 16) % (1U << 2) ; - out++; - *out = ((*in) >> 18) % (1U << 2) ; - out++; - *out = ((*in) >> 20) % (1U << 2) ; - out++; - *out = ((*in) >> 22) % (1U << 2) ; - out++; - *out = ((*in) >> 24) % (1U << 2) ; - out++; - *out = ((*in) >> 26) % (1U << 2) ; - out++; - *out = ((*in) >> 28) % (1U << 2) ; - out++; - *out = ((*in) >> 30) ; +void __fastunpack17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = 
((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + *out = ((*in) >> 8) % (1U << 17); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + out++; + *out = ((*in) >> 10) % (1U << 17); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + out++; + *out = ((*in) >> 12) % (1U << 17); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + out++; + *out = ((*in) >> 14) % (1U << 17); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 1)) << (17 - 1); + out++; + *out = ((*in) >> 1) % (1U << 17); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 3)) << (17 - 3); + out++; + *out = ((*in) >> 3) % (1U << 17); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 5)) << (17 - 5); + out++; + *out = ((*in) >> 5) % (1U << 17); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 7)) << (17 - 7); + out++; + *out = ((*in) >> 7) % (1U << 17); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 9)) << (17 - 9); + out++; + *out = ((*in) >> 9) % (1U << 17); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 11)) << (17 - 11); + out++; + *out = ((*in) >> 11) % (1U << 17); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 13)) << (17 - 13); + out++; + *out = ((*in) >> 13) % (1U << 17); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 15)) << (17 - 15); + out++; + *out = ((*in) >> 15); } +void __fastunpack18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 
18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); +} +void __fastunpack19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = 
((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + out++; + *out = ((*in) >> 11) % (1U << 19); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + out++; + *out = ((*in) >> 4) % (1U << 19); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + out++; + *out = ((*in) >> 10) % (1U << 19); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 3)) << (19 - 3); + out++; + *out = ((*in) >> 3) % (1U << 19); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 9)) << (19 - 9); + out++; + *out = ((*in) >> 9) % (1U << 19); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 15)) << (19 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 2)) << (19 - 2); + out++; + *out = ((*in) >> 2) % (1U << 19); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 8)) << (19 - 8); + out++; + *out = ((*in) >> 8) % (1U << 19); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 14)) << (19 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 1)) << (19 - 1); + out++; + *out = ((*in) >> 1) % (1U << 19); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 7)) << (19 - 7); + out++; + *out = ((*in) >> 7) % (1U << 19); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 13)) << (19 - 13); + out++; + *out = ((*in) >> 
13); +} +void __fastunpack20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = 
((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); +} -void __fastunpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 3) ; - out++; - *out = ((*in) >> 3) % (1U << 3) ; - out++; - *out = ((*in) >> 6) % (1U << 3) ; - out++; - *out = ((*in) >> 9) % (1U << 3) ; - out++; - *out = ((*in) >> 12) % (1U << 3) ; - out++; - *out = ((*in) >> 15) % (1U << 3) ; - out++; - *out = ((*in) >> 18) % (1U << 3) ; - out++; - *out = ((*in) >> 21) % (1U << 3) ; - out++; - *out = ((*in) >> 24) % (1U << 3) ; - out++; - *out = ((*in) >> 27) % (1U << 3) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 1)) << (3 - 1); - out++; - *out = ((*in) >> 1) % (1U << 3) ; - out++; - *out = ((*in) >> 4) % (1U << 3) ; - out++; - *out = ((*in) >> 7) % (1U << 3) ; - out++; - *out = ((*in) >> 10) % (1U << 3) ; - out++; - *out = ((*in) >> 13) % (1U << 3) ; - out++; - *out = ((*in) >> 16) % (1U << 3) ; - out++; - *out = ((*in) >> 19) % (1U << 3) ; - out++; - *out = ((*in) >> 22) % (1U << 3) ; - out++; - *out = ((*in) >> 25) % (1U << 3) ; - out++; - *out = ((*in) >> 28) % (1U << 3) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 2)) << (3 - 2); - out++; - *out = ((*in) >> 2) % (1U << 3) ; - out++; - *out = ((*in) >> 5) % (1U << 3) ; - out++; - *out = ((*in) >> 8) % (1U << 3) ; - out++; - *out = ((*in) >> 11) % (1U << 3) ; - out++; - *out = ((*in) >> 14) % (1U << 3) ; - out++; - *out = ((*in) >> 17) % (1U << 3) ; - out++; - *out = ((*in) >> 20) % (1U << 3) ; - out++; - *out = ((*in) >> 23) % (1U << 3) ; - out++; - *out = ((*in) >> 26) % (1U << 3) ; - out++; - *out = ((*in) >> 29) ; +void __fastunpack21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 
21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + out++; + *out = ((*in) >> 8) % (1U << 21); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + out++; + *out = ((*in) >> 7) % (1U << 21); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + out++; + *out = ((*in) >> 6) % (1U << 21); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 5)) << (21 - 5); + out++; + *out = ((*in) >> 5) % (1U << 21); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 15)) << (21 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 4)) << (21 - 4); + out++; + *out = ((*in) >> 4) % (1U << 21); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 14)) << (21 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 3)) << (21 - 3); + out++; + *out = ((*in) >> 3) % (1U << 21); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 13)) << (21 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 2)) << (21 - 2); + out++; + *out = ((*in) >> 2) % (1U << 21); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 12)) << (21 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 1)) << (21 - 1); + out++; + *out = ((*in) >> 1) % (1U << 21); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % 
(1U << 11)) << (21 - 11); + out++; + *out = ((*in) >> 11); } +void __fastunpack22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= 
((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); +} +void __fastunpack23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + out++; + *out = ((*in) >> 6) % (1U << 23); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + out++; + *out = ((*in) >> 2) % (1U << 23); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 7)) << (23 - 7); + out++; + *out = ((*in) >> 7) % (1U << 23); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 21)) << (23 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 12)) << (23 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 3)) << (23 - 3); + 
out++; + *out = ((*in) >> 3) % (1U << 23); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 17)) << (23 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 8)) << (23 - 8); + out++; + *out = ((*in) >> 8) % (1U << 23); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 22)) << (23 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 13)) << (23 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 4)) << (23 - 4); + out++; + *out = ((*in) >> 4) % (1U << 23); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 18)) << (23 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 9)) << (23 - 9); + out++; + *out = ((*in) >> 9); +} +void __fastunpack24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + 
out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); +} -void __fastunpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 5) ; - out++; - *out = ((*in) >> 5) % (1U << 5) ; - out++; - *out = ((*in) >> 10) % (1U << 5) ; - out++; - *out = ((*in) >> 15) % (1U << 5) ; - out++; - *out = ((*in) >> 20) % (1U << 5) ; - out++; - *out = ((*in) >> 25) % (1U << 5) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 3)) << (5 - 3); - out++; - *out = ((*in) >> 3) % (1U << 5) ; - out++; - *out = ((*in) >> 8) % (1U << 5) ; - out++; - *out = ((*in) >> 13) % (1U << 5) ; - out++; - *out = ((*in) >> 18) % (1U << 5) ; - out++; - *out = ((*in) >> 23) % (1U << 5) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 1)) << (5 - 1); - out++; - *out = ((*in) >> 1) % (1U << 5) ; - out++; - *out = ((*in) >> 6) % (1U << 5) ; - out++; - *out = ((*in) >> 11) % (1U << 5) ; - out++; - *out = ((*in) >> 16) % (1U << 5) ; - out++; - *out = ((*in) >> 21) % (1U << 5) ; - out++; - *out = ((*in) >> 26) % (1U << 5) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 4)) << (5 - 4); - out++; - *out 
= ((*in) >> 4) % (1U << 5) ; - out++; - *out = ((*in) >> 9) % (1U << 5) ; - out++; - *out = ((*in) >> 14) % (1U << 5) ; - out++; - *out = ((*in) >> 19) % (1U << 5) ; - out++; - *out = ((*in) >> 24) % (1U << 5) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 2)) << (5 - 2); - out++; - *out = ((*in) >> 2) % (1U << 5) ; - out++; - *out = ((*in) >> 7) % (1U << 5) ; - out++; - *out = ((*in) >> 12) % (1U << 5) ; - out++; - *out = ((*in) >> 17) % (1U << 5) ; - out++; - *out = ((*in) >> 22) % (1U << 5) ; - out++; - *out = ((*in) >> 27) ; +void __fastunpack25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + out++; + *out = ((*in) >> 1) % (1U << 25); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + out++; + *out = ((*in) >> 5) % (1U << 25); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 9)) << (25 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 2)) << (25 
- 2); + out++; + *out = ((*in) >> 2) % (1U << 25); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 20)) << (25 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 13)) << (25 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 6)) << (25 - 6); + out++; + *out = ((*in) >> 6) % (1U << 25); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 24)) << (25 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 17)) << (25 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 10)) << (25 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 3)) << (25 - 3); + out++; + *out = ((*in) >> 3) % (1U << 25); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 21)) << (25 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 14)) << (25 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 7)) << (25 - 7); + out++; + *out = ((*in) >> 7); } +void __fastunpack26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << 
(26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); +} +void __fastunpack27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + 
++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + out++; + *out = ((*in) >> 4) % (1U << 27); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 11)) << (27 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 6)) << (27 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 1)) << (27 - 1); + out++; + *out = ((*in) >> 1) % (1U << 27); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 23)) << (27 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 18)) << (27 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 13)) << (27 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 8)) << (27 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 3)) << (27 - 3); + out++; + *out = ((*in) >> 3) % (1U << 27); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 25)) << (27 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 20)) << (27 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 15)) << (27 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 10)) << (27 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U 
<< 5)) << (27 - 5); + out++; + *out = ((*in) >> 5); +} +void __fastunpack28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + 
++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); +} -void __fastunpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 6) ; - out++; - *out = ((*in) >> 6) % (1U << 6) ; - out++; - *out = ((*in) >> 12) % (1U << 6) ; - out++; - *out = ((*in) >> 18) % (1U << 6) ; - out++; - *out = ((*in) >> 24) % (1U << 6) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6) ; - out++; - *out = ((*in) >> 10) % (1U << 6) ; - out++; - *out = ((*in) >> 16) % (1U << 6) ; - out++; - *out = ((*in) >> 22) % (1U << 6) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6) ; - out++; - *out = ((*in) >> 8) % (1U << 6) ; - out++; - *out = ((*in) >> 14) % (1U << 6) ; - out++; - *out = ((*in) >> 20) % (1U << 6) ; - out++; - *out = ((*in) >> 26) ; +void __fastunpack29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 
11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + out++; + *out = ((*in) >> 2) % (1U << 29); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 13)) << (29 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 10)) << (29 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 7)) << (29 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 4)) << (29 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 1)) << (29 - 1); + out++; + *out = ((*in) >> 1) % (1U << 29); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 27)) << (29 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 24)) << (29 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 21)) << (29 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 18)) << (29 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 15)) << (29 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 12)) << (29 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 9)) << (29 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 6)) << (29 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 3)) << (29 - 3); + out++; + *out = 
((*in) >> 3); +} + +void __fastunpack30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U 
<< 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); +} + +void __fastunpack31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 23)) << (31 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + out++; + *out = 
((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 15)) << (31 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 14)) << (31 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 13)) << (31 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 12)) << (31 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 11)) << (31 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 10)) << (31 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 9)) << (31 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 8)) << (31 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 7)) << (31 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 6)) << (31 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 5)) << (31 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 4)) << (31 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 3)) << (31 - 3); + out++; + *out = ((*in) >> 3); + ++in; + *out |= ((*in) % (1U << 2)) << (31 - 2); + out++; + *out = ((*in) >> 2); + ++in; + *out |= ((*in) % (1U << 1)) << (31 - 1); + out++; + *out = ((*in) >> 1); +} + +void __fastunpack4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); ++in; - out++; - *out = ((*in) >> 0) % (1U << 6) ; - out++; - *out = ((*in) >> 6) % (1U << 6) ; - out++; - *out = ((*in) >> 12) % (1U << 6) ; - out++; - *out = ((*in) >> 18) % (1U << 6) ; - out++; - *out = ((*in) >> 24) % (1U << 6) ; - out++; - *out = ((*in) >> 30) ; + } +} + +void __fastunpack8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for 
(uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) + *(out++) = ((*in) >> inwordpointer) % (1U << 8); ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6) ; - out++; - *out = ((*in) >> 10) % (1U << 6) ; - out++; - *out = ((*in) >> 16) % (1U << 6) ; - out++; - *out = ((*in) >> 22) % (1U << 6) ; - out++; - *out = ((*in) >> 28) ; + } +} + +void __fastunpack16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6) ; - out++; - *out = ((*in) >> 8) % (1U << 6) ; - out++; - *out = ((*in) >> 14) % (1U << 6) ; - out++; - *out = ((*in) >> 20) % (1U << 6) ; - out++; - *out = ((*in) >> 26) ; + } } +void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) & 1; + ++in; + *out |= ((*in) & 1) << 1; + ++in; + *out |= ((*in) & 1) << 2; + ++in; + *out |= ((*in) & 1) << 3; + ++in; + *out |= ((*in) & 1) << 4; + ++in; + *out |= ((*in) & 1) << 5; + ++in; + *out |= ((*in) & 1) << 6; + ++in; + *out |= ((*in) & 1) << 7; + ++in; + *out |= ((*in) & 1) << 8; + ++in; + *out |= ((*in) & 1) << 9; + ++in; + *out |= ((*in) & 1) << 10; + ++in; + *out |= ((*in) & 1) << 11; + ++in; + *out |= ((*in) & 1) << 12; + ++in; + *out |= ((*in) & 1) << 13; + ++in; + *out |= ((*in) & 1) << 14; + ++in; + *out |= ((*in) & 1) << 15; + ++in; + *out |= ((*in) & 1) << 16; + ++in; + *out |= ((*in) & 1) << 17; + ++in; + *out |= ((*in) & 1) << 18; + ++in; + *out |= ((*in) & 1) << 19; + ++in; + *out |= ((*in) & 1) << 20; + ++in; + *out |= ((*in) & 1) << 21; + ++in; + *out |= ((*in) & 1) << 22; + ++in; + *out |= ((*in) & 1) << 23; + ++in; + *out |= ((*in) & 1) << 24; + ++in; + *out |= ((*in) & 1) << 25; + 
++in; + *out |= ((*in) & 1) << 26; + ++in; + *out |= ((*in) & 1) << 27; + ++in; + *out |= ((*in) & 1) << 28; + ++in; + *out |= ((*in) & 1) << 29; + ++in; + *out |= ((*in) & 1) << 30; + ++in; + *out |= ((*in)) << 31; +} +void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 2); + ++in; + *out |= ((*in) % (1U << 2)) << 2; + ++in; + *out |= ((*in) % (1U << 2)) << 4; + ++in; + *out |= ((*in) % (1U << 2)) << 6; + ++in; + *out |= ((*in) % (1U << 2)) << 8; + ++in; + *out |= ((*in) % (1U << 2)) << 10; + ++in; + *out |= ((*in) % (1U << 2)) << 12; + ++in; + *out |= ((*in) % (1U << 2)) << 14; + ++in; + *out |= ((*in) % (1U << 2)) << 16; + ++in; + *out |= ((*in) % (1U << 2)) << 18; + ++in; + *out |= ((*in) % (1U << 2)) << 20; + ++in; + *out |= ((*in) % (1U << 2)) << 22; + ++in; + *out |= ((*in) % (1U << 2)) << 24; + ++in; + *out |= ((*in) % (1U << 2)) << 26; + ++in; + *out |= ((*in) % (1U << 2)) << 28; + ++in; + *out |= ((*in)) << 30; + ++out; + ++in; + *out = (*in) % (1U << 2); + ++in; + *out |= ((*in) % (1U << 2)) << 2; + ++in; + *out |= ((*in) % (1U << 2)) << 4; + ++in; + *out |= ((*in) % (1U << 2)) << 6; + ++in; + *out |= ((*in) % (1U << 2)) << 8; + ++in; + *out |= ((*in) % (1U << 2)) << 10; + ++in; + *out |= ((*in) % (1U << 2)) << 12; + ++in; + *out |= ((*in) % (1U << 2)) << 14; + ++in; + *out |= ((*in) % (1U << 2)) << 16; + ++in; + *out |= ((*in) % (1U << 2)) << 18; + ++in; + *out |= ((*in) % (1U << 2)) << 20; + ++in; + *out |= ((*in) % (1U << 2)) << 22; + ++in; + *out |= ((*in) % (1U << 2)) << 24; + ++in; + *out |= ((*in) % (1U << 2)) << 26; + ++in; + *out |= ((*in) % (1U << 2)) << 28; + ++in; + *out |= ((*in)) << 30; +} +void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 3); + ++in; + *out |= ((*in) % (1U << 3)) << 3; + ++in; + *out |= ((*in) % (1U << 3)) << 6; + ++in; + *out |= ((*in) % (1U << 3)) << 9; + ++in; + *out |= ((*in) % (1U << 3)) << 12; + ++in; + *out |= 
((*in) % (1U << 3)) << 15; + ++in; + *out |= ((*in) % (1U << 3)) << 18; + ++in; + *out |= ((*in) % (1U << 3)) << 21; + ++in; + *out |= ((*in) % (1U << 3)) << 24; + ++in; + *out |= ((*in) % (1U << 3)) << 27; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 3)) >> (3 - 1); + ++in; + *out |= ((*in) % (1U << 3)) << 1; + ++in; + *out |= ((*in) % (1U << 3)) << 4; + ++in; + *out |= ((*in) % (1U << 3)) << 7; + ++in; + *out |= ((*in) % (1U << 3)) << 10; + ++in; + *out |= ((*in) % (1U << 3)) << 13; + ++in; + *out |= ((*in) % (1U << 3)) << 16; + ++in; + *out |= ((*in) % (1U << 3)) << 19; + ++in; + *out |= ((*in) % (1U << 3)) << 22; + ++in; + *out |= ((*in) % (1U << 3)) << 25; + ++in; + *out |= ((*in) % (1U << 3)) << 28; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 3)) >> (3 - 2); + ++in; + *out |= ((*in) % (1U << 3)) << 2; + ++in; + *out |= ((*in) % (1U << 3)) << 5; + ++in; + *out |= ((*in) % (1U << 3)) << 8; + ++in; + *out |= ((*in) % (1U << 3)) << 11; + ++in; + *out |= ((*in) % (1U << 3)) << 14; + ++in; + *out |= ((*in) % (1U << 3)) << 17; + ++in; + *out |= ((*in) % (1U << 3)) << 20; + ++in; + *out |= ((*in) % (1U << 3)) << 23; + ++in; + *out |= ((*in) % (1U << 3)) << 26; + ++in; + *out |= ((*in)) << 29; +} -void __fastunpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 7) ; - out++; - *out = ((*in) >> 7) % (1U << 7) ; - out++; - *out = ((*in) >> 14) % (1U << 7) ; - out++; - *out = ((*in) >> 21) % (1U << 7) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 3)) << (7 - 3); - out++; - *out = ((*in) >> 3) % (1U << 7) ; - out++; - *out = ((*in) >> 10) % (1U << 7) ; - out++; - *out = ((*in) >> 17) % (1U << 7) ; - out++; - *out = ((*in) >> 24) % (1U << 7) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 6)) << (7 - 6); - out++; - *out = ((*in) >> 6) % (1U << 7) ; - out++; - *out = ((*in) >> 13) % (1U << 7) ; - out++; - *out = ((*in) >> 20) % (1U << 7) 
; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 2)) << (7 - 2); - out++; - *out = ((*in) >> 2) % (1U << 7) ; - out++; - *out = ((*in) >> 9) % (1U << 7) ; - out++; - *out = ((*in) >> 16) % (1U << 7) ; - out++; - *out = ((*in) >> 23) % (1U << 7) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 5)) << (7 - 5); - out++; - *out = ((*in) >> 5) % (1U << 7) ; - out++; - *out = ((*in) >> 12) % (1U << 7) ; - out++; - *out = ((*in) >> 19) % (1U << 7) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 1)) << (7 - 1); - out++; - *out = ((*in) >> 1) % (1U << 7) ; - out++; - *out = ((*in) >> 8) % (1U << 7) ; - out++; - *out = ((*in) >> 15) % (1U << 7) ; - out++; - *out = ((*in) >> 22) % (1U << 7) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 4)) << (7 - 4); - out++; - *out = ((*in) >> 4) % (1U << 7) ; - out++; - *out = ((*in) >> 11) % (1U << 7) ; - out++; - *out = ((*in) >> 18) % (1U << 7) ; - out++; - *out = ((*in) >> 25) ; +void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 4); + ++in; + *out |= ((*in) % (1U << 4)) << 4; + ++in; + *out |= ((*in) % (1U << 4)) << 8; + ++in; + *out |= ((*in) % (1U << 4)) << 12; + ++in; + *out |= ((*in) % (1U << 4)) << 16; + ++in; + *out |= ((*in) % (1U << 4)) << 20; + ++in; + *out |= ((*in) % (1U << 4)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in) % (1U << 4); + ++in; + *out |= ((*in) % (1U << 4)) << 4; + ++in; + *out |= ((*in) % (1U << 4)) << 8; + ++in; + *out |= ((*in) % (1U << 4)) << 12; + ++in; + *out |= ((*in) % (1U << 4)) << 16; + ++in; + *out |= ((*in) % (1U << 4)) << 20; + ++in; + *out |= ((*in) % (1U << 4)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in) % (1U << 4); + ++in; + *out |= ((*in) % (1U << 4)) << 4; + ++in; + *out |= ((*in) % (1U << 4)) << 8; + ++in; + *out |= ((*in) % (1U << 4)) << 12; + ++in; + *out |= ((*in) % (1U << 4)) << 16; + ++in; + 
*out |= ((*in) % (1U << 4)) << 20; + ++in; + *out |= ((*in) % (1U << 4)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in) % (1U << 4); + ++in; + *out |= ((*in) % (1U << 4)) << 4; + ++in; + *out |= ((*in) % (1U << 4)) << 8; + ++in; + *out |= ((*in) % (1U << 4)) << 12; + ++in; + *out |= ((*in) % (1U << 4)) << 16; + ++in; + *out |= ((*in) % (1U << 4)) << 20; + ++in; + *out |= ((*in) % (1U << 4)) << 24; + ++in; + *out |= ((*in)) << 28; } +void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 5); + ++in; + *out |= ((*in) % (1U << 5)) << 5; + ++in; + *out |= ((*in) % (1U << 5)) << 10; + ++in; + *out |= ((*in) % (1U << 5)) << 15; + ++in; + *out |= ((*in) % (1U << 5)) << 20; + ++in; + *out |= ((*in) % (1U << 5)) << 25; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 5)) >> (5 - 3); + ++in; + *out |= ((*in) % (1U << 5)) << 3; + ++in; + *out |= ((*in) % (1U << 5)) << 8; + ++in; + *out |= ((*in) % (1U << 5)) << 13; + ++in; + *out |= ((*in) % (1U << 5)) << 18; + ++in; + *out |= ((*in) % (1U << 5)) << 23; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 5)) >> (5 - 1); + ++in; + *out |= ((*in) % (1U << 5)) << 1; + ++in; + *out |= ((*in) % (1U << 5)) << 6; + ++in; + *out |= ((*in) % (1U << 5)) << 11; + ++in; + *out |= ((*in) % (1U << 5)) << 16; + ++in; + *out |= ((*in) % (1U << 5)) << 21; + ++in; + *out |= ((*in) % (1U << 5)) << 26; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 5)) >> (5 - 4); + ++in; + *out |= ((*in) % (1U << 5)) << 4; + ++in; + *out |= ((*in) % (1U << 5)) << 9; + ++in; + *out |= ((*in) % (1U << 5)) << 14; + ++in; + *out |= ((*in) % (1U << 5)) << 19; + ++in; + *out |= ((*in) % (1U << 5)) << 24; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 5)) >> (5 - 2); + ++in; + *out |= ((*in) % (1U << 5)) << 2; + ++in; + *out |= ((*in) % (1U << 5)) << 7; + ++in; + *out |= ((*in) % (1U << 5)) << 12; + ++in; + *out |= ((*in) % 
(1U << 5)) << 17; + ++in; + *out |= ((*in) % (1U << 5)) << 22; + ++in; + *out |= ((*in)) << 27; +} +void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 6); + ++in; + *out |= ((*in) % (1U << 6)) << 6; + ++in; + *out |= ((*in) % (1U << 6)) << 12; + ++in; + *out |= ((*in) % (1U << 6)) << 18; + ++in; + *out |= ((*in) % (1U << 6)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 6)) >> (6 - 4); + ++in; + *out |= ((*in) % (1U << 6)) << 4; + ++in; + *out |= ((*in) % (1U << 6)) << 10; + ++in; + *out |= ((*in) % (1U << 6)) << 16; + ++in; + *out |= ((*in) % (1U << 6)) << 22; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 6)) >> (6 - 2); + ++in; + *out |= ((*in) % (1U << 6)) << 2; + ++in; + *out |= ((*in) % (1U << 6)) << 8; + ++in; + *out |= ((*in) % (1U << 6)) << 14; + ++in; + *out |= ((*in) % (1U << 6)) << 20; + ++in; + *out |= ((*in)) << 26; + ++out; + ++in; + *out = (*in) % (1U << 6); + ++in; + *out |= ((*in) % (1U << 6)) << 6; + ++in; + *out |= ((*in) % (1U << 6)) << 12; + ++in; + *out |= ((*in) % (1U << 6)) << 18; + ++in; + *out |= ((*in) % (1U << 6)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 6)) >> (6 - 4); + ++in; + *out |= ((*in) % (1U << 6)) << 4; + ++in; + *out |= ((*in) % (1U << 6)) << 10; + ++in; + *out |= ((*in) % (1U << 6)) << 16; + ++in; + *out |= ((*in) % (1U << 6)) << 22; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 6)) >> (6 - 2); + ++in; + *out |= ((*in) % (1U << 6)) << 2; + ++in; + *out |= ((*in) % (1U << 6)) << 8; + ++in; + *out |= ((*in) % (1U << 6)) << 14; + ++in; + *out |= ((*in) % (1U << 6)) << 20; + ++in; + *out |= ((*in)) << 26; +} +void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 7); + ++in; + *out |= ((*in) % (1U << 7)) << 7; + ++in; + *out |= ((*in) % (1U << 7)) << 14; + ++in; + *out |= ((*in) % (1U << 7)) << 21; + ++in; + *out |= ((*in)) << 
28; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 3); + ++in; + *out |= ((*in) % (1U << 7)) << 3; + ++in; + *out |= ((*in) % (1U << 7)) << 10; + ++in; + *out |= ((*in) % (1U << 7)) << 17; + ++in; + *out |= ((*in) % (1U << 7)) << 24; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 6); + ++in; + *out |= ((*in) % (1U << 7)) << 6; + ++in; + *out |= ((*in) % (1U << 7)) << 13; + ++in; + *out |= ((*in) % (1U << 7)) << 20; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 2); + ++in; + *out |= ((*in) % (1U << 7)) << 2; + ++in; + *out |= ((*in) % (1U << 7)) << 9; + ++in; + *out |= ((*in) % (1U << 7)) << 16; + ++in; + *out |= ((*in) % (1U << 7)) << 23; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 5); + ++in; + *out |= ((*in) % (1U << 7)) << 5; + ++in; + *out |= ((*in) % (1U << 7)) << 12; + ++in; + *out |= ((*in) % (1U << 7)) << 19; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 1); + ++in; + *out |= ((*in) % (1U << 7)) << 1; + ++in; + *out |= ((*in) % (1U << 7)) << 8; + ++in; + *out |= ((*in) % (1U << 7)) << 15; + ++in; + *out |= ((*in) % (1U << 7)) << 22; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 7)) >> (7 - 4); + ++in; + *out |= ((*in) % (1U << 7)) << 4; + ++in; + *out |= ((*in) % (1U << 7)) << 11; + ++in; + *out |= ((*in) % (1U << 7)) << 18; + ++in; + *out |= ((*in)) << 25; +} -void __fastunpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 9) ; - out++; - *out = ((*in) >> 9) % (1U << 9) ; - out++; - *out = ((*in) >> 18) % (1U << 9) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 4)) << (9 - 4); - out++; - *out = ((*in) >> 4) % (1U << 9) ; - out++; - *out = ((*in) >> 13) % (1U << 9) ; - out++; - *out = ((*in) >> 22) % (1U << 9) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 8)) << (9 - 8); - out++; - *out = ((*in) >> 8) % (1U << 9) 
; - out++; - *out = ((*in) >> 17) % (1U << 9) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 3)) << (9 - 3); - out++; - *out = ((*in) >> 3) % (1U << 9) ; - out++; - *out = ((*in) >> 12) % (1U << 9) ; - out++; - *out = ((*in) >> 21) % (1U << 9) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 7)) << (9 - 7); - out++; - *out = ((*in) >> 7) % (1U << 9) ; - out++; - *out = ((*in) >> 16) % (1U << 9) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 2)) << (9 - 2); - out++; - *out = ((*in) >> 2) % (1U << 9) ; - out++; - *out = ((*in) >> 11) % (1U << 9) ; - out++; - *out = ((*in) >> 20) % (1U << 9) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 6)) << (9 - 6); - out++; - *out = ((*in) >> 6) % (1U << 9) ; - out++; - *out = ((*in) >> 15) % (1U << 9) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 1)) << (9 - 1); - out++; - *out = ((*in) >> 1) % (1U << 9) ; - out++; - *out = ((*in) >> 10) % (1U << 9) ; - out++; - *out = ((*in) >> 19) % (1U << 9) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 5)) << (9 - 5); - out++; - *out = ((*in) >> 5) % (1U << 9) ; - out++; - *out = ((*in) >> 14) % (1U << 9) ; - out++; - *out = ((*in) >> 23) ; +void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 
24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in) % (1U << 8); + ++in; + *out |= ((*in) % (1U << 8)) << 8; + ++in; + *out |= ((*in) % (1U << 8)) << 16; + ++in; + *out |= ((*in)) << 24; } +void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 9); + ++in; + *out |= ((*in) % (1U << 9)) << 9; + ++in; + *out |= ((*in) % (1U << 9)) << 18; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 4); + ++in; + *out |= ((*in) % (1U << 9)) << 4; + ++in; + *out |= ((*in) % (1U << 9)) << 13; + ++in; + *out |= ((*in) % (1U << 9)) << 22; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 8); + ++in; + *out |= ((*in) % (1U << 9)) << 8; + ++in; + *out |= ((*in) % (1U << 9)) << 17; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 3); + ++in; + *out |= ((*in) % (1U << 9)) << 3; + ++in; + *out |= ((*in) % (1U << 9)) << 12; + ++in; + *out |= ((*in) % (1U << 9)) << 21; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 7); + ++in; + *out |= ((*in) % (1U << 9)) << 7; + ++in; + *out |= ((*in) % (1U << 9)) << 16; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 2); + ++in; + *out |= ((*in) % (1U << 9)) << 2; + ++in; + *out |= ((*in) % (1U << 9)) << 11; + ++in; + *out |= ((*in) % (1U << 9)) << 20; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 6); + ++in; + *out |= ((*in) % (1U << 9)) << 6; + 
++in; + *out |= ((*in) % (1U << 9)) << 15; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 1); + ++in; + *out |= ((*in) % (1U << 9)) << 1; + ++in; + *out |= ((*in) % (1U << 9)) << 10; + ++in; + *out |= ((*in) % (1U << 9)) << 19; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 9)) >> (9 - 5); + ++in; + *out |= ((*in) % (1U << 9)) << 5; + ++in; + *out |= ((*in) % (1U << 9)) << 14; + ++in; + *out |= ((*in)) << 23; +} +void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 10); + ++in; + *out |= ((*in) % (1U << 10)) << 10; + ++in; + *out |= ((*in) % (1U << 10)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 8); + ++in; + *out |= ((*in) % (1U << 10)) << 8; + ++in; + *out |= ((*in) % (1U << 10)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 6); + ++in; + *out |= ((*in) % (1U << 10)) << 6; + ++in; + *out |= ((*in) % (1U << 10)) << 16; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 4); + ++in; + *out |= ((*in) % (1U << 10)) << 4; + ++in; + *out |= ((*in) % (1U << 10)) << 14; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 2); + ++in; + *out |= ((*in) % (1U << 10)) << 2; + ++in; + *out |= ((*in) % (1U << 10)) << 12; + ++in; + *out |= ((*in)) << 22; + ++out; + ++in; + *out = (*in) % (1U << 10); + ++in; + *out |= ((*in) % (1U << 10)) << 10; + ++in; + *out |= ((*in) % (1U << 10)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 8); + ++in; + *out |= ((*in) % (1U << 10)) << 8; + ++in; + *out |= ((*in) % (1U << 10)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 6); + ++in; + *out |= ((*in) % (1U << 10)) << 6; + ++in; + *out |= ((*in) % (1U << 10)) << 16; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 4); + ++in; + *out 
|= ((*in) % (1U << 10)) << 4; + ++in; + *out |= ((*in) % (1U << 10)) << 14; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 10)) >> (10 - 2); + ++in; + *out |= ((*in) % (1U << 10)) << 2; + ++in; + *out |= ((*in) % (1U << 10)) << 12; + ++in; + *out |= ((*in)) << 22; +} +void __fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 11); + ++in; + *out |= ((*in) % (1U << 11)) << 11; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 1); + ++in; + *out |= ((*in) % (1U << 11)) << 1; + ++in; + *out |= ((*in) % (1U << 11)) << 12; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 2); + ++in; + *out |= ((*in) % (1U << 11)) << 2; + ++in; + *out |= ((*in) % (1U << 11)) << 13; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 3); + ++in; + *out |= ((*in) % (1U << 11)) << 3; + ++in; + *out |= ((*in) % (1U << 11)) << 14; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 4); + ++in; + *out |= ((*in) % (1U << 11)) << 4; + ++in; + *out |= ((*in) % (1U << 11)) << 15; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 5); + ++in; + *out |= ((*in) % (1U << 11)) << 5; + ++in; + *out |= ((*in) % (1U << 11)) << 16; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 6); + ++in; + *out |= ((*in) % (1U << 11)) << 6; + ++in; + *out |= ((*in) % (1U << 11)) << 17; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 7); + ++in; + *out |= ((*in) % (1U << 11)) << 7; + ++in; + *out |= ((*in) % (1U << 11)) << 18; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 8); + ++in; + *out |= ((*in) % (1U << 11)) << 8; + ++in; + *out |= ((*in) % (1U << 11)) << 19; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 9); + ++in; + *out |= ((*in) % (1U << 11)) << 9; + ++in; + *out 
|= ((*in) % (1U << 11)) << 20; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 11)) >> (11 - 10); + ++in; + *out |= ((*in) % (1U << 11)) << 10; + ++in; + *out |= ((*in)) << 21; +} -void __fastunpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 10) ; - out++; - *out = ((*in) >> 10) % (1U << 10) ; - out++; - *out = ((*in) >> 20) % (1U << 10) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10) ; - out++; - *out = ((*in) >> 18) % (1U << 10) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10) ; - out++; - *out = ((*in) >> 16) % (1U << 10) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10) ; - out++; - *out = ((*in) >> 14) % (1U << 10) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10) ; - out++; - *out = ((*in) >> 12) % (1U << 10) ; - out++; - *out = ((*in) >> 22) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 10) ; - out++; - *out = ((*in) >> 10) % (1U << 10) ; - out++; - *out = ((*in) >> 20) % (1U << 10) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10) ; - out++; - *out = ((*in) >> 18) % (1U << 10) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10) ; - out++; - *out = ((*in) >> 16) % (1U << 10) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10) ; - out++; - *out = ((*in) >> 14) % (1U << 10) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10) ; - out++; - *out = ((*in) 
>> 12) % (1U << 10) ; - out++; - *out = ((*in) >> 22) ; +void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 12); + ++in; + *out |= ((*in) % (1U << 12)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in) % (1U << 12)) << 4; + ++in; + *out |= ((*in) % (1U << 12)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in) % (1U << 12)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in) % (1U << 12); + ++in; + *out |= ((*in) % (1U << 12)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in) % (1U << 12)) << 4; + ++in; + *out |= ((*in) % (1U << 12)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in) % (1U << 12)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in) % (1U << 12); + ++in; + *out |= ((*in) % (1U << 12)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in) % (1U << 12)) << 4; + ++in; + *out |= ((*in) % (1U << 12)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in) % (1U << 12)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in) % (1U << 12); + ++in; + *out |= ((*in) % (1U << 12)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in) % (1U << 12)) << 4; + ++in; + *out |= ((*in) % (1U << 12)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in) % (1U << 12)) << 8; + ++in; + *out |= ((*in)) << 20; } +void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 13); + ++in; + *out |= ((*in) % (1U << 13)) << 
13; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 7); + ++in; + *out |= ((*in) % (1U << 13)) << 7; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 1); + ++in; + *out |= ((*in) % (1U << 13)) << 1; + ++in; + *out |= ((*in) % (1U << 13)) << 14; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 8); + ++in; + *out |= ((*in) % (1U << 13)) << 8; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 2); + ++in; + *out |= ((*in) % (1U << 13)) << 2; + ++in; + *out |= ((*in) % (1U << 13)) << 15; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 9); + ++in; + *out |= ((*in) % (1U << 13)) << 9; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 3); + ++in; + *out |= ((*in) % (1U << 13)) << 3; + ++in; + *out |= ((*in) % (1U << 13)) << 16; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 10); + ++in; + *out |= ((*in) % (1U << 13)) << 10; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 4); + ++in; + *out |= ((*in) % (1U << 13)) << 4; + ++in; + *out |= ((*in) % (1U << 13)) << 17; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 11); + ++in; + *out |= ((*in) % (1U << 13)) << 11; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 5); + ++in; + *out |= ((*in) % (1U << 13)) << 5; + ++in; + *out |= ((*in) % (1U << 13)) << 18; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 12); + ++in; + *out |= ((*in) % (1U << 13)) << 12; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 13)) >> (13 - 6); + ++in; + *out |= ((*in) % (1U << 13)) << 6; + ++in; + *out |= ((*in)) << 19; +} +void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 14); + ++in; + *out |= ((*in) % (1U << 14)) << 14; + 
++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 10); + ++in; + *out |= ((*in) % (1U << 14)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 6); + ++in; + *out |= ((*in) % (1U << 14)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 2); + ++in; + *out |= ((*in) % (1U << 14)) << 2; + ++in; + *out |= ((*in) % (1U << 14)) << 16; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 12); + ++in; + *out |= ((*in) % (1U << 14)) << 12; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 8); + ++in; + *out |= ((*in) % (1U << 14)) << 8; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 4); + ++in; + *out |= ((*in) % (1U << 14)) << 4; + ++in; + *out |= ((*in)) << 18; + ++out; + ++in; + *out = (*in) % (1U << 14); + ++in; + *out |= ((*in) % (1U << 14)) << 14; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 10); + ++in; + *out |= ((*in) % (1U << 14)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 6); + ++in; + *out |= ((*in) % (1U << 14)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 2); + ++in; + *out |= ((*in) % (1U << 14)) << 2; + ++in; + *out |= ((*in) % (1U << 14)) << 16; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 12); + ++in; + *out |= ((*in) % (1U << 14)) << 12; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 8); + ++in; + *out |= ((*in) % (1U << 14)) << 8; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 14)) >> (14 - 4); + ++in; + *out |= ((*in) % (1U << 14)) << 4; + ++in; + *out |= ((*in)) << 18; +} +void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 15); + ++in; + *out |= ((*in) % (1U << 15)) << 15; + ++in; + *out |= 
((*in)) << 30; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 13); + ++in; + *out |= ((*in) % (1U << 15)) << 13; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 11); + ++in; + *out |= ((*in) % (1U << 15)) << 11; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 9); + ++in; + *out |= ((*in) % (1U << 15)) << 9; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 7); + ++in; + *out |= ((*in) % (1U << 15)) << 7; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 5); + ++in; + *out |= ((*in) % (1U << 15)) << 5; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 3); + ++in; + *out |= ((*in) % (1U << 15)) << 3; + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 1); + ++in; + *out |= ((*in) % (1U << 15)) << 1; + ++in; + *out |= ((*in) % (1U << 15)) << 16; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 14); + ++in; + *out |= ((*in) % (1U << 15)) << 14; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 12); + ++in; + *out |= ((*in) % (1U << 15)) << 12; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 10); + ++in; + *out |= ((*in) % (1U << 15)) << 10; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 8); + ++in; + *out |= ((*in) % (1U << 15)) << 8; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 6); + ++in; + *out |= ((*in) % (1U << 15)) << 6; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 4); + ++in; + *out |= ((*in) % (1U << 15)) << 4; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 15)) >> (15 - 2); + ++in; + *out |= ((*in) % (1U << 15)) << 2; + ++in; + *out |= ((*in)) << 17; +} -void __fastunpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) 
% (1U << 11) ; - out++; - *out = ((*in) >> 11) % (1U << 11) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 1)) << (11 - 1); - out++; - *out = ((*in) >> 1) % (1U << 11) ; - out++; - *out = ((*in) >> 12) % (1U << 11) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 2)) << (11 - 2); - out++; - *out = ((*in) >> 2) % (1U << 11) ; - out++; - *out = ((*in) >> 13) % (1U << 11) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 3)) << (11 - 3); - out++; - *out = ((*in) >> 3) % (1U << 11) ; - out++; - *out = ((*in) >> 14) % (1U << 11) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 4)) << (11 - 4); - out++; - *out = ((*in) >> 4) % (1U << 11) ; - out++; - *out = ((*in) >> 15) % (1U << 11) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 5)) << (11 - 5); - out++; - *out = ((*in) >> 5) % (1U << 11) ; - out++; - *out = ((*in) >> 16) % (1U << 11) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 6)) << (11 - 6); - out++; - *out = ((*in) >> 6) % (1U << 11) ; - out++; - *out = ((*in) >> 17) % (1U << 11) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 7)) << (11 - 7); - out++; - *out = ((*in) >> 7) % (1U << 11) ; - out++; - *out = ((*in) >> 18) % (1U << 11) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 8)) << (11 - 8); - out++; - *out = ((*in) >> 8) % (1U << 11) ; - out++; - *out = ((*in) >> 19) % (1U << 11) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 9)) << (11 - 9); - out++; - *out = ((*in) >> 9) % (1U << 11) ; - out++; - *out = ((*in) >> 20) % (1U << 11) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 10)) << (11 - 10); - out++; - *out = ((*in) >> 10) % (1U << 11) ; - out++; - *out = ((*in) >> 21) ; +void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out 
= (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in) % (1U << 16); + ++in; + *out |= ((*in)) << 16; } +void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 2); + ++in; + *out |= ((*in) % (1U << 17)) << 2; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 4); + ++in; + *out |= ((*in) % (1U << 17)) << 4; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 6); + ++in; + *out |= ((*in) % (1U << 17)) << 6; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 8); + ++in; + *out |= ((*in) % (1U << 17)) << 8; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 10); + ++in; + *out |= ((*in) % (1U << 17)) << 10; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 17)) 
>> (17 - 12); + ++in; + *out |= ((*in) % (1U << 17)) << 12; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 14); + ++in; + *out |= ((*in) % (1U << 17)) << 14; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 1); + ++in; + *out |= ((*in) % (1U << 17)) << 1; + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 3); + ++in; + *out |= ((*in) % (1U << 17)) << 3; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 5); + ++in; + *out |= ((*in) % (1U << 17)) << 5; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 7); + ++in; + *out |= ((*in) % (1U << 17)) << 7; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 9); + ++in; + *out |= ((*in) % (1U << 17)) << 9; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 11); + ++in; + *out |= ((*in) % (1U << 17)) << 11; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 13); + ++in; + *out |= ((*in) % (1U << 17)) << 13; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 17)) >> (17 - 15); + ++in; + *out |= ((*in)) << 15; +} +void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 4); + ++in; + *out |= ((*in) % (1U << 18)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 8); + ++in; + *out |= ((*in) % (1U << 18)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 12); + ++in; + *out |= ((*in) % (1U << 18)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 2); + ++in; + 
*out |= ((*in) % (1U << 18)) << 2; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 6); + ++in; + *out |= ((*in) % (1U << 18)) << 6; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 10); + ++in; + *out |= ((*in) % (1U << 18)) << 10; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + ++in; + *out = (*in) % (1U << 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 4); + ++in; + *out |= ((*in) % (1U << 18)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 8); + ++in; + *out |= ((*in) % (1U << 18)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 12); + ++in; + *out |= ((*in) % (1U << 18)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 2); + ++in; + *out |= ((*in) % (1U << 18)) << 2; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 6); + ++in; + *out |= ((*in) % (1U << 18)) << 6; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 10); + ++in; + *out |= ((*in) % (1U << 18)) << 10; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 18)) >> (18 - 14); + ++in; + *out |= ((*in)) << 14; +} +void __fastpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 6); + ++in; + *out |= ((*in) % (1U << 19)) << 6; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 12); + ++in; + *out |= ((*in) % (1U << 19)) << 12; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 5); + 
++in; + *out |= ((*in) % (1U << 19)) << 5; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 11); + ++in; + *out |= ((*in) % (1U << 19)) << 11; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 4); + ++in; + *out |= ((*in) % (1U << 19)) << 4; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 10); + ++in; + *out |= ((*in) % (1U << 19)) << 10; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 3); + ++in; + *out |= ((*in) % (1U << 19)) << 3; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 9); + ++in; + *out |= ((*in) % (1U << 19)) << 9; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 2); + ++in; + *out |= ((*in) % (1U << 19)) << 2; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 8); + ++in; + *out |= ((*in) % (1U << 19)) << 8; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 1); + ++in; + *out |= ((*in) % (1U << 19)) << 1; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 7); + ++in; + *out |= ((*in) % (1U << 19)) << 7; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 19)) >> (19 - 13); + ++in; + *out |= ((*in)) << 13; +} -void __fastunpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 12) ; - out++; - *out = ((*in) >> 12) % (1U << 12) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12) ; - out++; - *out = 
((*in) >> 16) % (1U << 12) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12) ; - out++; - *out = ((*in) >> 20) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12) ; - out++; - *out = ((*in) >> 12) % (1U << 12) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12) ; - out++; - *out = ((*in) >> 16) % (1U << 12) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12) ; - out++; - *out = ((*in) >> 20) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12) ; - out++; - *out = ((*in) >> 12) % (1U << 12) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12) ; - out++; - *out = ((*in) >> 16) % (1U << 12) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12) ; - out++; - *out = ((*in) >> 20) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12) ; - out++; - *out = ((*in) >> 12) % (1U << 12) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12) ; - out++; - *out = ((*in) >> 16) % (1U << 12) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12) ; - out++; - *out = ((*in) >> 20) ; +void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in) % (1U << 20)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in) % (1U << 20)) << 4; + ++in; + *out 
|= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in) % (1U << 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in) % (1U << 20)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in) % (1U << 20)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in) % (1U << 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in) % (1U << 20)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in) % (1U << 20)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in) % (1U << 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in) % (1U << 20)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in) % (1U << 20)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; } +void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 10); + ++in; + *out |= ((*in) % (1U << 21)) << 10; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 20); + ++in; + *out |= ((*in)) << 
20; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 9); + ++in; + *out |= ((*in) % (1U << 21)) << 9; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 8); + ++in; + *out |= ((*in) % (1U << 21)) << 8; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 7); + ++in; + *out |= ((*in) % (1U << 21)) << 7; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 6); + ++in; + *out |= ((*in) % (1U << 21)) << 6; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 5); + ++in; + *out |= ((*in) % (1U << 21)) << 5; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 4); + ++in; + *out |= ((*in) % (1U << 21)) << 4; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 3); + ++in; + *out |= ((*in) % (1U << 21)) << 3; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 2); + ++in; + *out |= ((*in) % (1U << 21)) << 2; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 1); + ++in; + *out |= ((*in) % (1U << 21)) << 1; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 21)) >> (21 - 11); + ++in; + *out |= ((*in)) << 11; +} +void __fastpack22(const uint32_t *__restrict__ 
in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 2); + ++in; + *out |= ((*in) % (1U << 22)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 4); + ++in; + *out |= ((*in) % (1U << 22)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 6); + ++in; + *out |= ((*in) % (1U << 22)) << 6; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 8); + ++in; + *out |= ((*in) % (1U << 22)) << 8; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + ++in; + *out = (*in) % (1U << 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 2); + ++in; + *out |= ((*in) % (1U << 22)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 4); + ++in; + *out |= ((*in) % (1U << 22)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 6); + ++in; + *out |= ((*in) % (1U << 22)) << 6; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 8); + 
++in; + *out |= ((*in) % (1U << 22)) << 8; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 22)) >> (22 - 10); + ++in; + *out |= ((*in)) << 10; +} +void __fastpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 5); + ++in; + *out |= ((*in) % (1U << 23)) << 5; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 1); + ++in; + *out |= ((*in) % (1U << 23)) << 1; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 6); + ++in; + *out |= ((*in) % (1U << 23)) << 6; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 2); + ++in; + *out |= ((*in) % (1U << 23)) << 2; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 7); + ++in; + *out |= ((*in) % (1U << 23)) << 7; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 3); + ++in; + *out |= ((*in) % (1U << 23)) << 3; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 
17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 8); + ++in; + *out |= ((*in) % (1U << 23)) << 8; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 4); + ++in; + *out |= ((*in) % (1U << 23)) << 4; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 23)) >> (23 - 9); + ++in; + *out |= ((*in)) << 9; +} -void __fastunpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 13) ; - out++; - *out = ((*in) >> 13) % (1U << 13) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 7)) << (13 - 7); - out++; - *out = ((*in) >> 7) % (1U << 13) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 1)) << (13 - 1); - out++; - *out = ((*in) >> 1) % (1U << 13) ; - out++; - *out = ((*in) >> 14) % (1U << 13) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 8)) << (13 - 8); - out++; - *out = ((*in) >> 8) % (1U << 13) ; - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 2)) << (13 - 2); - out++; - *out = ((*in) >> 2) % (1U << 13) ; - out++; - *out = ((*in) >> 15) % (1U << 13) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 9)) << (13 - 9); - out++; - *out = ((*in) >> 9) % (1U << 13) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 3)) << (13 - 3); - out++; - *out = ((*in) >> 3) % (1U << 13) ; - out++; - *out = ((*in) >> 16) % (1U << 13) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 10)) << (13 - 10); - out++; - *out = ((*in) >> 10) % (1U << 13) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 4)) << (13 - 4); - out++; - *out = ((*in) >> 4) % (1U << 
13) ; - out++; - *out = ((*in) >> 17) % (1U << 13) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 11)) << (13 - 11); - out++; - *out = ((*in) >> 11) % (1U << 13) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 5)) << (13 - 5); - out++; - *out = ((*in) >> 5) % (1U << 13) ; - out++; - *out = ((*in) >> 18) % (1U << 13) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 12)) << (13 - 12); - out++; - *out = ((*in) >> 12) % (1U << 13) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 6)) << (13 - 6); - out++; - *out = ((*in) >> 6) % (1U << 13) ; - out++; - *out = ((*in) >> 19) ; +void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = 
(*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in) % (1U << 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; } +void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 4); + ++in; + *out |= ((*in) % (1U << 25)) << 4; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 1); + ++in; + *out |= ((*in) % (1U << 25)) << 1; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 5); + ++in; + *out |= ((*in) % (1U << 25)) << 5; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 16); + 
++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 2); + ++in; + *out |= ((*in) % (1U << 25)) << 2; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 6); + ++in; + *out |= ((*in) % (1U << 25)) << 6; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 3); + ++in; + *out |= ((*in) % (1U << 25)) << 3; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 25)) >> (25 - 7); + ++in; + *out |= ((*in)) << 7; +} +void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 2); + ++in; + *out |= ((*in) % (1U << 26)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = 
((*in) % (1U << 26)) >> (26 - 4); + ++in; + *out |= ((*in) % (1U << 26)) << 4; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + ++in; + *out = (*in) % (1U << 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 2); + ++in; + *out |= ((*in) % (1U << 26)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 4); + ++in; + *out |= ((*in) % (1U << 26)) << 4; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 26)) >> (26 - 6); + ++in; + *out |= ((*in)) << 6; +} +void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U 
<< 27)) >> (27 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 2); + ++in; + *out |= ((*in) % (1U << 27)) << 2; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 4); + ++in; + *out |= ((*in) % (1U << 27)) << 4; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 1); + ++in; + *out |= ((*in) % (1U << 27)) << 1; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 3); + ++in; + *out |= ((*in) % (1U << 27)) << 3; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 15); + 
++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 27)) >> (27 - 5); + ++in; + *out |= ((*in)) << 5; +} -void __fastunpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 14) ; - out++; - *out = ((*in) >> 14) % (1U << 14) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14) ; - out++; - *out = ((*in) >> 16) % (1U << 14) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 8)) << (14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14) ; - out++; - *out = ((*in) >> 18) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 14) ; - out++; - *out = ((*in) >> 14) % (1U << 14) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14) ; - out++; - *out = ((*in) >> 16) % (1U << 14) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 8)) << 
(14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14) ; - out++; - *out = ((*in) >> 18) ; +void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in) % (1U << 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in) % (1U << 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) 
% (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in) % (1U << 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; } +void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 29); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 5); + ++in; + *out |= ((*in)) << 5; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 2); + ++in; + *out |= ((*in) % (1U << 29)) << 2; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 29)) >> (29 
- 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 1); + ++in; + *out |= ((*in) % (1U << 29)) << 1; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in) % (1U << 29)) >> (29 - 3); + ++in; + *out |= ((*in)) << 3; +} +void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) % (1U << 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 16); + 
++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 2); + ++in; + *out |= ((*in)) << 2; + ++out; + ++in; + *out = (*in) % (1U << 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in) % (1U << 30)) >> (30 - 2); + ++in; + *out |= ((*in)) << 2; +} +void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = (*in) 
% (1U << 31); + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 29); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 8); + ++in; + 
*out |= ((*in)) << 8; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 5); + ++in; + *out |= ((*in)) << 5; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 3); + ++in; + *out |= ((*in)) << 3; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 2); + ++in; + *out |= ((*in)) << 2; + ++out; + *out = ((*in) % (1U << 31)) >> (31 - 1); + ++in; + *out |= ((*in)) << 1; +} -void __fastunpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 15) ; - out++; - *out = ((*in) >> 15) % (1U << 15) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 13)) << (15 - 13); - out++; - *out = ((*in) >> 13) % (1U << 15) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 11)) << (15 - 11); - out++; - *out = ((*in) >> 11) % (1U << 15) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 9)) << (15 - 9); - out++; - *out = ((*in) >> 9) % (1U << 15) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 7)) << (15 - 7); - out++; - *out = ((*in) >> 7) % (1U << 15) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 5)) << (15 - 5); - out++; - *out = ((*in) >> 5) % (1U << 15) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 3)) << (15 - 3); - out++; - *out = ((*in) >> 3) % (1U << 15) ; - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 1)) << (15 - 1); - out++; - *out = ((*in) >> 1) % (1U << 15) ; - out++; - *out = ((*in) >> 16) % (1U << 15) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 14)) << (15 - 14); - out++; - *out = ((*in) >> 14) % (1U << 15) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 12)) << (15 - 12); - out++; - *out = ((*in) >> 
12) % (1U << 15) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 10)) << (15 - 10); - out++; - *out = ((*in) >> 10) % (1U << 15) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 8)) << (15 - 8); - out++; - *out = ((*in) >> 8) % (1U << 15) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 6)) << (15 - 6); - out++; - *out = ((*in) >> 6) % (1U << 15) ; - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 4)) << (15 - 4); - out++; - *out = ((*in) >> 4) % (1U << 15) ; - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 2)) << (15 - 2); - out++; - *out = ((*in) >> 2) % (1U << 15) ; - out++; - *out = ((*in) >> 17) ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 23; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 25; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 27; + ++in; + *out |= ((*in)) << 28; + ++in; + *out |= ((*in)) << 29; + ++in; + *out |= ((*in)) << 30; + ++in; + *out |= ((*in)) << 31; } +/*assumes that integers fit in the prescribed number of bits */ +void 
__fastpackwithoutmask2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 28; + ++in; + *out |= ((*in)) << 30; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 28; + ++in; + *out |= ((*in)) << 30; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 27; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (3 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 25; + ++in; + *out |= ((*in)) << 28; + ++in; + 
*out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (3 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 23; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 29; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; +} -void __fastunpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 17) ; - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 2)) << (17 - 2); - out++; - *out = ((*in) >> 2) % (1U << 17) ; - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 4)) << (17 - 4); - out++; - *out = ((*in) >> 4) % (1U << 17) ; - out++; - *out = ((*in) >> 21) 
; - ++in; - *out |= ((*in) % (1U << 6)) << (17 - 6); - out++; - *out = ((*in) >> 6) % (1U << 17) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 8)) << (17 - 8); - out++; - *out = ((*in) >> 8) % (1U << 17) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 10)) << (17 - 10); - out++; - *out = ((*in) >> 10) % (1U << 17) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 12)) << (17 - 12); - out++; - *out = ((*in) >> 12) % (1U << 17) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 14)) << (17 - 14); - out++; - *out = ((*in) >> 14) % (1U << 17) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 16)) << (17 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 1)) << (17 - 1); - out++; - *out = ((*in) >> 1) % (1U << 17) ; - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 3)) << (17 - 3); - out++; - *out = ((*in) >> 3) % (1U << 17) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 5)) << (17 - 5); - out++; - *out = ((*in) >> 5) % (1U << 17) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 7)) << (17 - 7); - out++; - *out = ((*in) >> 7) % (1U << 17) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 9)) << (17 - 9); - out++; - *out = ((*in) >> 9) % (1U << 17) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 11)) << (17 - 11); - out++; - *out = ((*in) >> 11) % (1U << 17) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 13)) << (17 - 13); - out++; - *out = ((*in) >> 13) % (1U << 17) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 15)) << (17 - 15); - out++; - *out = ((*in) >> 15) ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 
10; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 25; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (5 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 23; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (5 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (5 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (5 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 27; } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (6 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (6 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 26; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= 
((*in)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (6 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (6 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 26; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (7 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (7 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (7 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 23; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (7 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (7 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (7 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 25; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask8(const uint32_t *__restrict__ 
in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; +} -void __fastunpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 18) ; - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - *out = ((*in) >> 8) % (1U << 18) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % 
(1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 18) ; - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - *out = ((*in) >> 8) % (1U << 18) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14) ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (9 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (9 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (9 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 30; + 
++out; + *out = ((*in)) >> (9 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (9 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (9 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (9 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (9 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 23; } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (10 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (10 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (10 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (10 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 22; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (10 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (10 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = 
((*in)) >> (10 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (10 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 22; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (11 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (11 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (11 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (11 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (11 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (11 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (11 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (11 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 19; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (11 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (11 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 21; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ 
out) { + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; +} -void __fastunpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 19) ; - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 6)) << (19 - 6); - out++; - *out = ((*in) >> 6) % (1U << 19) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 12)) << (19 - 12); - out++; - *out = ((*in) >> 12) % (1U << 19) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 18)) << (19 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 5)) << (19 - 5); - out++; - *out = ((*in) >> 5) % (1U << 19) ; - out++; - *out = ((*in) >> 24) ; - ++in; 
- *out |= ((*in) % (1U << 11)) << (19 - 11); - out++; - *out = ((*in) >> 11) % (1U << 19) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 17)) << (19 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 4)) << (19 - 4); - out++; - *out = ((*in) >> 4) % (1U << 19) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 10)) << (19 - 10); - out++; - *out = ((*in) >> 10) % (1U << 19) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 16)) << (19 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 3)) << (19 - 3); - out++; - *out = ((*in) >> 3) % (1U << 19) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 9)) << (19 - 9); - out++; - *out = ((*in) >> 9) % (1U << 19) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 15)) << (19 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 2)) << (19 - 2); - out++; - *out = ((*in) >> 2) % (1U << 19) ; - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 8)) << (19 - 8); - out++; - *out = ((*in) >> 8) % (1U << 19) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 14)) << (19 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 1)) << (19 - 1); - out++; - *out = ((*in) >> 1) % (1U << 19) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 7)) << (19 - 7); - out++; - *out = ((*in) >> 7) % (1U << 19) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 13)) << (19 - 13); - out++; - *out = ((*in) >> 13) ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (13 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (13 - 1); + 
++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (13 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (13 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (13 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (13 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (13 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (13 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (13 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (13 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (13 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (13 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 19; } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (14 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (14 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (14 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (14 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> 
(14 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (14 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 18; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (14 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (14 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (14 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (14 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (14 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (14 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 18; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (15 - 13); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (15 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (15 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (15 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (15 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (15 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (15 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (15 - 14); + ++in; + 
*out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (15 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (15 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (15 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (15 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (15 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (15 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 17; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; +} -void __fastunpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % 
(1U << 20) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20) ; - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U 
<< 12)) << (20 - 12); - out++; - *out = ((*in) >> 12) ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (17 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (17 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (17 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (17 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (17 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (17 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (17 - 14); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (17 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (17 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (17 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (17 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (17 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (17 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (17 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (17 - 13); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (17 - 15); + ++in; + *out |= ((*in)) << 15; } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask18(const uint32_t 
*__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (18 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (18 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (18 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (18 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (18 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (18 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (18 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (18 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (18 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (18 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (18 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (18 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (18 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (18 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (18 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (18 - 14); + ++in; + *out |= ((*in)) << 14; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (19 - 6); + ++in; + *out |= ((*in)) << 6; + 
++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (19 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (19 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (19 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (19 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (19 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (19 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (19 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (19 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (19 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (19 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (19 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (19 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (19 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (19 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (19 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (19 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (19 - 13); + ++in; + *out |= ((*in)) << 13; +} +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= 
((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; +} -void __fastunpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 21) ; - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 10)) << (21 - 10); - out++; - *out = ((*in) >> 10) % (1U << 21) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 20)) << (21 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 9)) << (21 - 9); - out++; - *out = ((*in) >> 9) % (1U << 21) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 
19)) << (21 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 8)) << (21 - 8); - out++; - *out = ((*in) >> 8) % (1U << 21) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 18)) << (21 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 7)) << (21 - 7); - out++; - *out = ((*in) >> 7) % (1U << 21) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 17)) << (21 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 6)) << (21 - 6); - out++; - *out = ((*in) >> 6) % (1U << 21) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 16)) << (21 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 5)) << (21 - 5); - out++; - *out = ((*in) >> 5) % (1U << 21) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 15)) << (21 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 4)) << (21 - 4); - out++; - *out = ((*in) >> 4) % (1U << 21) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 14)) << (21 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 3)) << (21 - 3); - out++; - *out = ((*in) >> 3) % (1U << 21) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 13)) << (21 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 2)) << (21 - 2); - out++; - *out = ((*in) >> 2) % (1U << 21) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 12)) << (21 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 1)) << (21 - 1); - out++; - *out = ((*in) >> 1) % (1U << 21) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 11)) << (21 - 11); - out++; - *out = ((*in) >> 11) ; -} - - - - -void __fastunpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 22) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 
12)) << (22 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 22) ; - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 12)) << (22 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % 
(1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10) ; -} - - - - -void __fastunpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 23) ; - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 14)) << (23 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 5)) << (23 - 5); - out++; - *out = ((*in) >> 5) % (1U << 23) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 19)) << (23 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 10)) << (23 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 1)) << (23 - 1); - out++; - *out = ((*in) >> 1) % (1U << 23) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 15)) << (23 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 6)) << (23 - 6); - out++; - *out = ((*in) >> 6) % (1U << 23) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 20)) << (23 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 11)) << (23 - 11); - out++; - *out = ((*in) >> 11) ; - ++in; - *out |= ((*in) % (1U << 2)) << (23 - 2); - out++; - *out = ((*in) >> 2) % (1U << 23) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 16)) << (23 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 7)) << (23 - 7); - out++; - *out = ((*in) >> 7) % (1U << 23) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 21)) << (23 - 21); - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 12)) << (23 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 3)) << (23 - 3); - out++; - *out = ((*in) >> 3) % (1U << 23) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 17)) << (23 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U 
<< 8)) << (23 - 8); - out++; - *out = ((*in) >> 8) % (1U << 23) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 22)) << (23 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 13)) << (23 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 4)) << (23 - 4); - out++; - *out = ((*in) >> 4) % (1U << 23) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 18)) << (23 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 9)) << (23 - 9); - out++; - *out = ((*in) >> 9) ; -} - - - - -void __fastunpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 
24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24) ; - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8) ; -} - - - - -void __fastunpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 25) ; - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 18)) << (25 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 11)) << (25 - 11); - out++; - *out = ((*in) >> 11) ; - ++in; - *out |= ((*in) % (1U << 4)) << (25 - 4); - out++; - *out = ((*in) >> 4) % (1U << 25) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 22)) << (25 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 15)) << (25 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 8)) << (25 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 1)) << (25 - 1); - out++; - *out = ((*in) >> 1) % (1U << 25) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 19)) << (25 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 12)) << (25 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 5)) << (25 - 5); - out++; - *out = ((*in) >> 5) % (1U << 25) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 23)) << (25 - 23); - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= 
((*in) % (1U << 16)) << (25 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 9)) << (25 - 9); - out++; - *out = ((*in) >> 9) ; - ++in; - *out |= ((*in) % (1U << 2)) << (25 - 2); - out++; - *out = ((*in) >> 2) % (1U << 25) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 20)) << (25 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 13)) << (25 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 6)) << (25 - 6); - out++; - *out = ((*in) >> 6) % (1U << 25) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 24)) << (25 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 17)) << (25 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 10)) << (25 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 3)) << (25 - 3); - out++; - *out = ((*in) >> 3) % (1U << 25) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 21)) << (25 - 21); - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 14)) << (25 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 7)) << (25 - 7); - out++; - *out = ((*in) >> 7) ; -} - - - - -void __fastunpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 26) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U 
<< 10)) << (26 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 26) ; - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 10)) << (26 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6) ; -} - - - - -void __fastunpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 27) ; - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 22)) << (27 - 22); - out++; - *out = ((*in) >> 22) ; 
- ++in; - *out |= ((*in) % (1U << 17)) << (27 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 12)) << (27 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 7)) << (27 - 7); - out++; - *out = ((*in) >> 7) ; - ++in; - *out |= ((*in) % (1U << 2)) << (27 - 2); - out++; - *out = ((*in) >> 2) % (1U << 27) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 24)) << (27 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 19)) << (27 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 14)) << (27 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 9)) << (27 - 9); - out++; - *out = ((*in) >> 9) ; - ++in; - *out |= ((*in) % (1U << 4)) << (27 - 4); - out++; - *out = ((*in) >> 4) % (1U << 27) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 26)) << (27 - 26); - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 21)) << (27 - 21); - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 16)) << (27 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 11)) << (27 - 11); - out++; - *out = ((*in) >> 11) ; - ++in; - *out |= ((*in) % (1U << 6)) << (27 - 6); - out++; - *out = ((*in) >> 6) ; - ++in; - *out |= ((*in) % (1U << 1)) << (27 - 1); - out++; - *out = ((*in) >> 1) % (1U << 27) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 23)) << (27 - 23); - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 18)) << (27 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 13)) << (27 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 8)) << (27 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 3)) << (27 - 3); - out++; - *out = ((*in) >> 3) % (1U << 27) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 25)) << (27 - 25); - out++; - *out = ((*in) >> 25) ; - ++in; 
- *out |= ((*in) % (1U << 20)) << (27 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 15)) << (27 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 10)) << (27 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 5)) << (27 - 5); - out++; - *out = ((*in) >> 5) ; -} - - - - -void __fastunpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 28) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % 
(1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28) ; - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4) ; -} - - - - -void __fastunpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 29) ; - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 26)) << (29 - 26); - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 23)) << (29 - 23); - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 20)) << (29 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 17)) << (29 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 14)) << (29 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 11)) << (29 - 11); - out++; - *out = ((*in) >> 11) ; - ++in; - *out |= ((*in) % (1U << 8)) << (29 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 5)) << (29 - 5); - out++; - *out = ((*in) >> 5) ; - ++in; - *out |= ((*in) % (1U << 2)) << (29 - 2); - out++; - *out = ((*in) >> 2) % (1U << 29) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 28)) << (29 - 28); - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 25)) << (29 - 25); - out++; - *out = ((*in) >> 25) ; - 
++in; - *out |= ((*in) % (1U << 22)) << (29 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 19)) << (29 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 16)) << (29 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 13)) << (29 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 10)) << (29 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 7)) << (29 - 7); - out++; - *out = ((*in) >> 7) ; - ++in; - *out |= ((*in) % (1U << 4)) << (29 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - *out |= ((*in) % (1U << 1)) << (29 - 1); - out++; - *out = ((*in) >> 1) % (1U << 29) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 27)) << (29 - 27); - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 24)) << (29 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 21)) << (29 - 21); - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 18)) << (29 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 15)) << (29 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 12)) << (29 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 9)) << (29 - 9); - out++; - *out = ((*in) >> 9) ; - ++in; - *out |= ((*in) % (1U << 6)) << (29 - 6); - out++; - *out = ((*in) >> 6) ; - ++in; - *out |= ((*in) % (1U << 3)) << (29 - 3); - out++; - *out = ((*in) >> 3) ; -} - - - - -void __fastunpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 30) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - 
*out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) >> 6) ; - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2) ; - ++in; - out++; - *out = ((*in) >> 0) % (1U << 30) ; - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 12); - out++; - *out = ((*in) >> 12) ; - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) 
>> 6) ; - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2) ; -} - - - - -void __fastunpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = ((*in) >> 0) % (1U << 31) ; - out++; - *out = ((*in) >> 31) ; - ++in; - *out |= ((*in) % (1U << 30)) << (31 - 30); - out++; - *out = ((*in) >> 30) ; - ++in; - *out |= ((*in) % (1U << 29)) << (31 - 29); - out++; - *out = ((*in) >> 29) ; - ++in; - *out |= ((*in) % (1U << 28)) << (31 - 28); - out++; - *out = ((*in) >> 28) ; - ++in; - *out |= ((*in) % (1U << 27)) << (31 - 27); - out++; - *out = ((*in) >> 27) ; - ++in; - *out |= ((*in) % (1U << 26)) << (31 - 26); - out++; - *out = ((*in) >> 26) ; - ++in; - *out |= ((*in) % (1U << 25)) << (31 - 25); - out++; - *out = ((*in) >> 25) ; - ++in; - *out |= ((*in) % (1U << 24)) << (31 - 24); - out++; - *out = ((*in) >> 24) ; - ++in; - *out |= ((*in) % (1U << 23)) << (31 - 23); - out++; - *out = ((*in) >> 23) ; - ++in; - *out |= ((*in) % (1U << 22)) << (31 - 22); - out++; - *out = ((*in) >> 22) ; - ++in; - *out |= ((*in) % (1U << 21)) << (31 - 21); - out++; - *out = ((*in) >> 21) ; - ++in; - *out |= ((*in) % (1U << 20)) << (31 - 20); - out++; - *out = ((*in) >> 20) ; - ++in; - *out |= ((*in) % (1U << 19)) << (31 - 19); - out++; - *out = ((*in) >> 19) ; - ++in; - *out |= ((*in) % (1U << 18)) << (31 - 18); - out++; - *out = ((*in) >> 18) ; - ++in; - *out |= ((*in) % (1U << 17)) << (31 - 17); - out++; - *out = ((*in) >> 17) ; - ++in; - *out |= ((*in) % (1U << 16)) << (31 - 16); - out++; - *out = ((*in) >> 16) ; - ++in; - *out |= ((*in) % (1U << 15)) << (31 - 15); - out++; - *out = ((*in) >> 15) ; - ++in; - *out |= ((*in) % (1U << 14)) << (31 - 14); - out++; - *out = ((*in) >> 14) ; - ++in; - *out |= ((*in) % (1U << 13)) << (31 - 13); - out++; - *out = ((*in) >> 13) ; - ++in; - *out |= ((*in) % (1U << 12)) << (31 - 12); - out++; - *out = ((*in) >> 12) ; - 
++in; - *out |= ((*in) % (1U << 11)) << (31 - 11); - out++; - *out = ((*in) >> 11) ; - ++in; - *out |= ((*in) % (1U << 10)) << (31 - 10); - out++; - *out = ((*in) >> 10) ; - ++in; - *out |= ((*in) % (1U << 9)) << (31 - 9); - out++; - *out = ((*in) >> 9) ; - ++in; - *out |= ((*in) % (1U << 8)) << (31 - 8); - out++; - *out = ((*in) >> 8) ; - ++in; - *out |= ((*in) % (1U << 7)) << (31 - 7); - out++; - *out = ((*in) >> 7) ; - ++in; - *out |= ((*in) % (1U << 6)) << (31 - 6); - out++; - *out = ((*in) >> 6) ; - ++in; - *out |= ((*in) % (1U << 5)) << (31 - 5); - out++; - *out = ((*in) >> 5) ; - ++in; - *out |= ((*in) % (1U << 4)) << (31 - 4); - out++; - *out = ((*in) >> 4) ; - ++in; - *out |= ((*in) % (1U << 3)) << (31 - 3); - out++; - *out = ((*in) >> 3) ; - ++in; - *out |= ((*in) % (1U << 2)) << (31 - 2); - out++; - *out = ((*in) >> 2) ; - ++in; - *out |= ((*in) % (1U << 1)) << (31 - 1); - out++; - *out = ((*in) >> 1) ; -} - - - - -void __fastunpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (uint32_t outer = 0; outer < 4 ; ++outer) { - for (uint32_t inwordpointer = 0 ; inwordpointer < 32; inwordpointer += 4) - * (out++) = ((*in) >> inwordpointer) % (1U << 4) ; - ++in; - } -} - - - - -void __fastunpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (uint32_t outer = 0; outer < 8 ; ++outer) { - for (uint32_t inwordpointer = 0 ; inwordpointer < 32; inwordpointer += 8) - * (out++) = ((*in) >> inwordpointer) % (1U << 8) ; - ++in; - } -} - - - - -void __fastunpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (uint32_t outer = 0; outer < 16 ; ++outer) { - for (uint32_t inwordpointer = 0 ; inwordpointer < 32; inwordpointer += 16) - * (out++) = ((*in) >> inwordpointer) % (1U << 16) ; - ++in; - } -} - - - - -void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) & 1 ; - ++in; - *out |= ((*in) & 1) << 1 ; - ++in; - *out |= ((*in) & 1) << 2 ; - ++in; - *out |= ((*in) 
& 1) << 3 ; - ++in; - *out |= ((*in) & 1) << 4 ; - ++in; - *out |= ((*in) & 1) << 5 ; - ++in; - *out |= ((*in) & 1) << 6 ; - ++in; - *out |= ((*in) & 1) << 7 ; - ++in; - *out |= ((*in) & 1) << 8 ; - ++in; - *out |= ((*in) & 1) << 9 ; - ++in; - *out |= ((*in) & 1) << 10 ; - ++in; - *out |= ((*in) & 1) << 11 ; - ++in; - *out |= ((*in) & 1) << 12 ; - ++in; - *out |= ((*in) & 1) << 13 ; - ++in; - *out |= ((*in) & 1) << 14 ; - ++in; - *out |= ((*in) & 1) << 15 ; - ++in; - *out |= ((*in) & 1) << 16 ; - ++in; - *out |= ((*in) & 1) << 17 ; - ++in; - *out |= ((*in) & 1) << 18 ; - ++in; - *out |= ((*in) & 1) << 19 ; - ++in; - *out |= ((*in) & 1) << 20 ; - ++in; - *out |= ((*in) & 1) << 21 ; - ++in; - *out |= ((*in) & 1) << 22 ; - ++in; - *out |= ((*in) & 1) << 23 ; - ++in; - *out |= ((*in) & 1) << 24 ; - ++in; - *out |= ((*in) & 1) << 25 ; - ++in; - *out |= ((*in) & 1) << 26 ; - ++in; - *out |= ((*in) & 1) << 27 ; - ++in; - *out |= ((*in) & 1) << 28 ; - ++in; - *out |= ((*in) & 1) << 29 ; - ++in; - *out |= ((*in) & 1) << 30 ; - ++in; - *out |= ((*in)) << 31 ; -} - - - - -void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 2) ; - ++in; - *out |= ((*in) % (1U << 2)) << 2 ; - ++in; - *out |= ((*in) % (1U << 2)) << 4 ; - ++in; - *out |= ((*in) % (1U << 2)) << 6 ; - ++in; - *out |= ((*in) % (1U << 2)) << 8 ; - ++in; - *out |= ((*in) % (1U << 2)) << 10 ; - ++in; - *out |= ((*in) % (1U << 2)) << 12 ; - ++in; - *out |= ((*in) % (1U << 2)) << 14 ; - ++in; - *out |= ((*in) % (1U << 2)) << 16 ; - ++in; - *out |= ((*in) % (1U << 2)) << 18 ; - ++in; - *out |= ((*in) % (1U << 2)) << 20 ; - ++in; - *out |= ((*in) % (1U << 2)) << 22 ; - ++in; - *out |= ((*in) % (1U << 2)) << 24 ; - ++in; - *out |= ((*in) % (1U << 2)) << 26 ; - ++in; - *out |= ((*in) % (1U << 2)) << 28 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - ++in; - *out = (*in) % (1U << 2) ; - ++in; - *out |= ((*in) % (1U << 2)) << 2 ; - ++in; - *out |= ((*in) % (1U << 2)) << 4 ; - 
++in; - *out |= ((*in) % (1U << 2)) << 6 ; - ++in; - *out |= ((*in) % (1U << 2)) << 8 ; - ++in; - *out |= ((*in) % (1U << 2)) << 10 ; - ++in; - *out |= ((*in) % (1U << 2)) << 12 ; - ++in; - *out |= ((*in) % (1U << 2)) << 14 ; - ++in; - *out |= ((*in) % (1U << 2)) << 16 ; - ++in; - *out |= ((*in) % (1U << 2)) << 18 ; - ++in; - *out |= ((*in) % (1U << 2)) << 20 ; - ++in; - *out |= ((*in) % (1U << 2)) << 22 ; - ++in; - *out |= ((*in) % (1U << 2)) << 24 ; - ++in; - *out |= ((*in) % (1U << 2)) << 26 ; - ++in; - *out |= ((*in) % (1U << 2)) << 28 ; - ++in; - *out |= ((*in)) << 30 ; -} - - - - -void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 3) ; - ++in; - *out |= ((*in) % (1U << 3)) << 3 ; - ++in; - *out |= ((*in) % (1U << 3)) << 6 ; - ++in; - *out |= ((*in) % (1U << 3)) << 9 ; - ++in; - *out |= ((*in) % (1U << 3)) << 12 ; - ++in; - *out |= ((*in) % (1U << 3)) << 15 ; - ++in; - *out |= ((*in) % (1U << 3)) << 18 ; - ++in; - *out |= ((*in) % (1U << 3)) << 21 ; - ++in; - *out |= ((*in) % (1U << 3)) << 24 ; - ++in; - *out |= ((*in) % (1U << 3)) << 27 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 3)) >> (3 - 1); - ++in; - *out |= ((*in) % (1U << 3)) << 1 ; - ++in; - *out |= ((*in) % (1U << 3)) << 4 ; - ++in; - *out |= ((*in) % (1U << 3)) << 7 ; - ++in; - *out |= ((*in) % (1U << 3)) << 10 ; - ++in; - *out |= ((*in) % (1U << 3)) << 13 ; - ++in; - *out |= ((*in) % (1U << 3)) << 16 ; - ++in; - *out |= ((*in) % (1U << 3)) << 19 ; - ++in; - *out |= ((*in) % (1U << 3)) << 22 ; - ++in; - *out |= ((*in) % (1U << 3)) << 25 ; - ++in; - *out |= ((*in) % (1U << 3)) << 28 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 3)) >> (3 - 2); - ++in; - *out |= ((*in) % (1U << 3)) << 2 ; - ++in; - *out |= ((*in) % (1U << 3)) << 5 ; - ++in; - *out |= ((*in) % (1U << 3)) << 8 ; - ++in; - *out |= ((*in) % (1U << 3)) << 11 ; - ++in; - *out |= ((*in) % (1U << 3)) << 14 ; - ++in; - *out |= ((*in) % (1U << 
3)) << 17 ; - ++in; - *out |= ((*in) % (1U << 3)) << 20 ; - ++in; - *out |= ((*in) % (1U << 3)) << 23 ; - ++in; - *out |= ((*in) % (1U << 3)) << 26 ; - ++in; - *out |= ((*in)) << 29 ; -} - - - - -void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 4) ; - ++in; - *out |= ((*in) % (1U << 4)) << 4 ; - ++in; - *out |= ((*in) % (1U << 4)) << 8 ; - ++in; - *out |= ((*in) % (1U << 4)) << 12 ; - ++in; - *out |= ((*in) % (1U << 4)) << 16 ; - ++in; - *out |= ((*in) % (1U << 4)) << 20 ; - ++in; - *out |= ((*in) % (1U << 4)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) % (1U << 4) ; - ++in; - *out |= ((*in) % (1U << 4)) << 4 ; - ++in; - *out |= ((*in) % (1U << 4)) << 8 ; - ++in; - *out |= ((*in) % (1U << 4)) << 12 ; - ++in; - *out |= ((*in) % (1U << 4)) << 16 ; - ++in; - *out |= ((*in) % (1U << 4)) << 20 ; - ++in; - *out |= ((*in) % (1U << 4)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) % (1U << 4) ; - ++in; - *out |= ((*in) % (1U << 4)) << 4 ; - ++in; - *out |= ((*in) % (1U << 4)) << 8 ; - ++in; - *out |= ((*in) % (1U << 4)) << 12 ; - ++in; - *out |= ((*in) % (1U << 4)) << 16 ; - ++in; - *out |= ((*in) % (1U << 4)) << 20 ; - ++in; - *out |= ((*in) % (1U << 4)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) % (1U << 4) ; - ++in; - *out |= ((*in) % (1U << 4)) << 4 ; - ++in; - *out |= ((*in) % (1U << 4)) << 8 ; - ++in; - *out |= ((*in) % (1U << 4)) << 12 ; - ++in; - *out |= ((*in) % (1U << 4)) << 16 ; - ++in; - *out |= ((*in) % (1U << 4)) << 20 ; - ++in; - *out |= ((*in) % (1U << 4)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; -} - - - - -void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 5) ; - ++in; - *out |= ((*in) % (1U << 5)) << 5 ; - ++in; - *out |= ((*in) % (1U << 5)) << 10 ; - ++in; - *out |= ((*in) % (1U << 5)) << 15 ; - ++in; - *out |= ((*in) % (1U << 5)) << 20 ; - ++in; - *out |= 
((*in) % (1U << 5)) << 25 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 5)) >> (5 - 3); - ++in; - *out |= ((*in) % (1U << 5)) << 3 ; - ++in; - *out |= ((*in) % (1U << 5)) << 8 ; - ++in; - *out |= ((*in) % (1U << 5)) << 13 ; - ++in; - *out |= ((*in) % (1U << 5)) << 18 ; - ++in; - *out |= ((*in) % (1U << 5)) << 23 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 5)) >> (5 - 1); - ++in; - *out |= ((*in) % (1U << 5)) << 1 ; - ++in; - *out |= ((*in) % (1U << 5)) << 6 ; - ++in; - *out |= ((*in) % (1U << 5)) << 11 ; - ++in; - *out |= ((*in) % (1U << 5)) << 16 ; - ++in; - *out |= ((*in) % (1U << 5)) << 21 ; - ++in; - *out |= ((*in) % (1U << 5)) << 26 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 5)) >> (5 - 4); - ++in; - *out |= ((*in) % (1U << 5)) << 4 ; - ++in; - *out |= ((*in) % (1U << 5)) << 9 ; - ++in; - *out |= ((*in) % (1U << 5)) << 14 ; - ++in; - *out |= ((*in) % (1U << 5)) << 19 ; - ++in; - *out |= ((*in) % (1U << 5)) << 24 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 5)) >> (5 - 2); - ++in; - *out |= ((*in) % (1U << 5)) << 2 ; - ++in; - *out |= ((*in) % (1U << 5)) << 7 ; - ++in; - *out |= ((*in) % (1U << 5)) << 12 ; - ++in; - *out |= ((*in) % (1U << 5)) << 17 ; - ++in; - *out |= ((*in) % (1U << 5)) << 22 ; - ++in; - *out |= ((*in)) << 27 ; -} - - - - -void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 6) ; - ++in; - *out |= ((*in) % (1U << 6)) << 6 ; - ++in; - *out |= ((*in) % (1U << 6)) << 12 ; - ++in; - *out |= ((*in) % (1U << 6)) << 18 ; - ++in; - *out |= ((*in) % (1U << 6)) << 24 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 6)) >> (6 - 4); - ++in; - *out |= ((*in) % (1U << 6)) << 4 ; - ++in; - *out |= ((*in) % (1U << 6)) << 10 ; - ++in; - *out |= ((*in) % (1U << 6)) << 16 ; - ++in; - *out |= ((*in) % (1U << 6)) << 22 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 6)) 
>> (6 - 2); - ++in; - *out |= ((*in) % (1U << 6)) << 2 ; - ++in; - *out |= ((*in) % (1U << 6)) << 8 ; - ++in; - *out |= ((*in) % (1U << 6)) << 14 ; - ++in; - *out |= ((*in) % (1U << 6)) << 20 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - ++in; - *out = (*in) % (1U << 6) ; - ++in; - *out |= ((*in) % (1U << 6)) << 6 ; - ++in; - *out |= ((*in) % (1U << 6)) << 12 ; - ++in; - *out |= ((*in) % (1U << 6)) << 18 ; - ++in; - *out |= ((*in) % (1U << 6)) << 24 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 6)) >> (6 - 4); - ++in; - *out |= ((*in) % (1U << 6)) << 4 ; - ++in; - *out |= ((*in) % (1U << 6)) << 10 ; - ++in; - *out |= ((*in) % (1U << 6)) << 16 ; - ++in; - *out |= ((*in) % (1U << 6)) << 22 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 6)) >> (6 - 2); - ++in; - *out |= ((*in) % (1U << 6)) << 2 ; - ++in; - *out |= ((*in) % (1U << 6)) << 8 ; - ++in; - *out |= ((*in) % (1U << 6)) << 14 ; - ++in; - *out |= ((*in) % (1U << 6)) << 20 ; - ++in; - *out |= ((*in)) << 26 ; -} - - - - -void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 7) ; - ++in; - *out |= ((*in) % (1U << 7)) << 7 ; - ++in; - *out |= ((*in) % (1U << 7)) << 14 ; - ++in; - *out |= ((*in) % (1U << 7)) << 21 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 3); - ++in; - *out |= ((*in) % (1U << 7)) << 3 ; - ++in; - *out |= ((*in) % (1U << 7)) << 10 ; - ++in; - *out |= ((*in) % (1U << 7)) << 17 ; - ++in; - *out |= ((*in) % (1U << 7)) << 24 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 6); - ++in; - *out |= ((*in) % (1U << 7)) << 6 ; - ++in; - *out |= ((*in) % (1U << 7)) << 13 ; - ++in; - *out |= ((*in) % (1U << 7)) << 20 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 2); - ++in; - *out |= ((*in) % (1U << 7)) << 2 ; - ++in; - *out |= ((*in) % (1U << 7)) << 9 ; - ++in; - *out |= ((*in) % (1U << 7)) << 16 ; - ++in; - 
*out |= ((*in) % (1U << 7)) << 23 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 5); - ++in; - *out |= ((*in) % (1U << 7)) << 5 ; - ++in; - *out |= ((*in) % (1U << 7)) << 12 ; - ++in; - *out |= ((*in) % (1U << 7)) << 19 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 1); - ++in; - *out |= ((*in) % (1U << 7)) << 1 ; - ++in; - *out |= ((*in) % (1U << 7)) << 8 ; - ++in; - *out |= ((*in) % (1U << 7)) << 15 ; - ++in; - *out |= ((*in) % (1U << 7)) << 22 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 7)) >> (7 - 4); - ++in; - *out |= ((*in) % (1U << 7)) << 4 ; - ++in; - *out |= ((*in) % (1U << 7)) << 11 ; - ++in; - *out |= ((*in) % (1U << 7)) << 18 ; - ++in; - *out |= ((*in)) << 25 ; -} - - - - -void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % 
(1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) % (1U << 8) ; - ++in; - *out |= ((*in) % (1U << 8)) << 8 ; - ++in; - *out |= ((*in) % (1U << 8)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; -} - - - - -void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 9) ; - ++in; - *out |= ((*in) % (1U << 9)) << 9 ; - ++in; - *out |= ((*in) % (1U << 9)) << 18 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 4); - ++in; - *out |= ((*in) % (1U << 9)) << 4 ; - ++in; - *out |= ((*in) % (1U << 9)) << 13 ; - ++in; - *out |= ((*in) % (1U << 9)) << 22 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 8); - ++in; - *out |= ((*in) % (1U << 9)) << 8 ; - ++in; - *out |= ((*in) % (1U << 9)) << 17 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 3); - ++in; - *out |= ((*in) % (1U << 9)) << 3 ; - ++in; - *out |= ((*in) % (1U << 9)) << 12 ; - ++in; - *out |= ((*in) % (1U << 9)) << 21 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 7); - ++in; - *out |= ((*in) % (1U << 9)) << 7 ; - ++in; - *out |= ((*in) % (1U << 9)) << 16 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 2); - ++in; - *out |= ((*in) % (1U << 9)) << 2 ; - ++in; - *out |= ((*in) % (1U << 9)) << 11 ; - ++in; - *out |= ((*in) % (1U << 9)) << 20 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 6); - ++in; - *out |= ((*in) % (1U << 9)) << 6 ; - ++in; - *out |= ((*in) % (1U << 9)) << 15 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 1); - ++in; - *out |= ((*in) % (1U << 9)) << 1 ; - ++in; - *out |= ((*in) % (1U << 9)) << 10 ; - ++in; - *out |= ((*in) % (1U << 9)) << 19 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 9)) >> (9 - 5); - ++in; - *out |= ((*in) % (1U << 9)) << 5 ; - ++in; - *out |= ((*in) 
% (1U << 9)) << 14 ; - ++in; - *out |= ((*in)) << 23 ; -} - - - - -void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 10) ; - ++in; - *out |= ((*in) % (1U << 10)) << 10 ; - ++in; - *out |= ((*in) % (1U << 10)) << 20 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 8); - ++in; - *out |= ((*in) % (1U << 10)) << 8 ; - ++in; - *out |= ((*in) % (1U << 10)) << 18 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 6); - ++in; - *out |= ((*in) % (1U << 10)) << 6 ; - ++in; - *out |= ((*in) % (1U << 10)) << 16 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 4); - ++in; - *out |= ((*in) % (1U << 10)) << 4 ; - ++in; - *out |= ((*in) % (1U << 10)) << 14 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 2); - ++in; - *out |= ((*in) % (1U << 10)) << 2 ; - ++in; - *out |= ((*in) % (1U << 10)) << 12 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - ++in; - *out = (*in) % (1U << 10) ; - ++in; - *out |= ((*in) % (1U << 10)) << 10 ; - ++in; - *out |= ((*in) % (1U << 10)) << 20 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 8); - ++in; - *out |= ((*in) % (1U << 10)) << 8 ; - ++in; - *out |= ((*in) % (1U << 10)) << 18 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 6); - ++in; - *out |= ((*in) % (1U << 10)) << 6 ; - ++in; - *out |= ((*in) % (1U << 10)) << 16 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 4); - ++in; - *out |= ((*in) % (1U << 10)) << 4 ; - ++in; - *out |= ((*in) % (1U << 10)) << 14 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 10)) >> (10 - 2); - ++in; - *out |= ((*in) % (1U << 10)) << 2 ; - ++in; - *out |= ((*in) % (1U << 10)) << 12 ; - ++in; - *out |= ((*in)) << 22 ; -} - - - - -void __fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ 
out) { - *out = (*in) % (1U << 11) ; - ++in; - *out |= ((*in) % (1U << 11)) << 11 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 1); - ++in; - *out |= ((*in) % (1U << 11)) << 1 ; - ++in; - *out |= ((*in) % (1U << 11)) << 12 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 2); - ++in; - *out |= ((*in) % (1U << 11)) << 2 ; - ++in; - *out |= ((*in) % (1U << 11)) << 13 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 3); - ++in; - *out |= ((*in) % (1U << 11)) << 3 ; - ++in; - *out |= ((*in) % (1U << 11)) << 14 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 4); - ++in; - *out |= ((*in) % (1U << 11)) << 4 ; - ++in; - *out |= ((*in) % (1U << 11)) << 15 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 5); - ++in; - *out |= ((*in) % (1U << 11)) << 5 ; - ++in; - *out |= ((*in) % (1U << 11)) << 16 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 6); - ++in; - *out |= ((*in) % (1U << 11)) << 6 ; - ++in; - *out |= ((*in) % (1U << 11)) << 17 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 7); - ++in; - *out |= ((*in) % (1U << 11)) << 7 ; - ++in; - *out |= ((*in) % (1U << 11)) << 18 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 8); - ++in; - *out |= ((*in) % (1U << 11)) << 8 ; - ++in; - *out |= ((*in) % (1U << 11)) << 19 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 9); - ++in; - *out |= ((*in) % (1U << 11)) << 9 ; - ++in; - *out |= ((*in) % (1U << 11)) << 20 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 11)) >> (11 - 10); - ++in; - *out |= ((*in) % (1U << 11)) << 10 ; - ++in; - *out |= ((*in)) << 21 ; -} - - - - -void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 12) ; - ++in; - 
*out |= ((*in) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) % (1U << 12) ; - ++in; - *out |= ((*in) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) % (1U << 12) ; - ++in; - *out |= ((*in) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) % (1U << 12) ; - ++in; - *out |= ((*in) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; -} - - - - -void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 13) ; - ++in; - *out |= ((*in) % (1U << 13)) << 13 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 7); - ++in; - *out |= ((*in) % (1U << 13)) << 7 
; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 1); - ++in; - *out |= ((*in) % (1U << 13)) << 1 ; - ++in; - *out |= ((*in) % (1U << 13)) << 14 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 8); - ++in; - *out |= ((*in) % (1U << 13)) << 8 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 2); - ++in; - *out |= ((*in) % (1U << 13)) << 2 ; - ++in; - *out |= ((*in) % (1U << 13)) << 15 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 9); - ++in; - *out |= ((*in) % (1U << 13)) << 9 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 3); - ++in; - *out |= ((*in) % (1U << 13)) << 3 ; - ++in; - *out |= ((*in) % (1U << 13)) << 16 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 10); - ++in; - *out |= ((*in) % (1U << 13)) << 10 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 4); - ++in; - *out |= ((*in) % (1U << 13)) << 4 ; - ++in; - *out |= ((*in) % (1U << 13)) << 17 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 11); - ++in; - *out |= ((*in) % (1U << 13)) << 11 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 5); - ++in; - *out |= ((*in) % (1U << 13)) << 5 ; - ++in; - *out |= ((*in) % (1U << 13)) << 18 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 12); - ++in; - *out |= ((*in) % (1U << 13)) << 12 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 13)) >> (13 - 6); - ++in; - *out |= ((*in) % (1U << 13)) << 6 ; - ++in; - *out |= ((*in)) << 19 ; -} - - - - -void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 14) ; - ++in; - *out |= ((*in) % (1U << 14)) << 14 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 10); - ++in; - 
*out |= ((*in) % (1U << 14)) << 10 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 6); - ++in; - *out |= ((*in) % (1U << 14)) << 6 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 2); - ++in; - *out |= ((*in) % (1U << 14)) << 2 ; - ++in; - *out |= ((*in) % (1U << 14)) << 16 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 12); - ++in; - *out |= ((*in) % (1U << 14)) << 12 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 8); - ++in; - *out |= ((*in) % (1U << 14)) << 8 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 4); - ++in; - *out |= ((*in) % (1U << 14)) << 4 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - ++in; - *out = (*in) % (1U << 14) ; - ++in; - *out |= ((*in) % (1U << 14)) << 14 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 10); - ++in; - *out |= ((*in) % (1U << 14)) << 10 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 6); - ++in; - *out |= ((*in) % (1U << 14)) << 6 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 2); - ++in; - *out |= ((*in) % (1U << 14)) << 2 ; - ++in; - *out |= ((*in) % (1U << 14)) << 16 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 12); - ++in; - *out |= ((*in) % (1U << 14)) << 12 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 8); - ++in; - *out |= ((*in) % (1U << 14)) << 8 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 14)) >> (14 - 4); - ++in; - *out |= ((*in) % (1U << 14)) << 4 ; - ++in; - *out |= ((*in)) << 18 ; -} - - - - -void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 15) ; - ++in; - *out |= ((*in) % (1U << 15)) << 15 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 15)) >> 
(15 - 13); - ++in; - *out |= ((*in) % (1U << 15)) << 13 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 11); - ++in; - *out |= ((*in) % (1U << 15)) << 11 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 9); - ++in; - *out |= ((*in) % (1U << 15)) << 9 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 7); - ++in; - *out |= ((*in) % (1U << 15)) << 7 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 5); - ++in; - *out |= ((*in) % (1U << 15)) << 5 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 3); - ++in; - *out |= ((*in) % (1U << 15)) << 3 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 1); - ++in; - *out |= ((*in) % (1U << 15)) << 1 ; - ++in; - *out |= ((*in) % (1U << 15)) << 16 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 14); - ++in; - *out |= ((*in) % (1U << 15)) << 14 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 12); - ++in; - *out |= ((*in) % (1U << 15)) << 12 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 10); - ++in; - *out |= ((*in) % (1U << 15)) << 10 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 8); - ++in; - *out |= ((*in) % (1U << 15)) << 8 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 6); - ++in; - *out |= ((*in) % (1U << 15)) << 6 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 4); - ++in; - *out |= ((*in) % (1U << 15)) << 4 ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 15)) >> (15 - 2); - ++in; - *out |= ((*in) % (1U << 15)) << 2 ; - ++in; - *out |= ((*in)) << 17 ; -} - - - - -void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 16) ; - ++in; - 
*out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) % (1U << 16) ; - ++in; - *out |= ((*in)) << 16 ; -} - - - - -void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 17) ; - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 2); - ++in; - *out |= ((*in) % (1U << 17)) << 2 ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 4); - ++in; - *out |= ((*in) % (1U << 17)) << 4 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 6); - ++in; - *out |= ((*in) % (1U << 17)) << 6 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 8); - ++in; - *out |= ((*in) % (1U << 17)) << 8 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 10); - ++in; - *out |= 
((*in) % (1U << 17)) << 10 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 12); - ++in; - *out |= ((*in) % (1U << 17)) << 12 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 14); - ++in; - *out |= ((*in) % (1U << 17)) << 14 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 1); - ++in; - *out |= ((*in) % (1U << 17)) << 1 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 3); - ++in; - *out |= ((*in) % (1U << 17)) << 3 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 5); - ++in; - *out |= ((*in) % (1U << 17)) << 5 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 7); - ++in; - *out |= ((*in) % (1U << 17)) << 7 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 9); - ++in; - *out |= ((*in) % (1U << 17)) << 9 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 11); - ++in; - *out |= ((*in) % (1U << 17)) << 11 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 13); - ++in; - *out |= ((*in) % (1U << 17)) << 13 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 17)) >> (17 - 15); - ++in; - *out |= ((*in)) << 15 ; -} - - - - -void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 18) ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 4); - ++in; - *out |= ((*in) % (1U << 18)) << 4 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 8); - ++in; - *out |= ((*in) % (1U << 18)) << 8 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 12); - ++in; - *out |= ((*in) % (1U << 18)) << 12 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - 
*out = ((*in) % (1U << 18)) >> (18 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 2); - ++in; - *out |= ((*in) % (1U << 18)) << 2 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 6); - ++in; - *out |= ((*in) % (1U << 18)) << 6 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 10); - ++in; - *out |= ((*in) % (1U << 18)) << 10 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - ++in; - *out = (*in) % (1U << 18) ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 4); - ++in; - *out |= ((*in) % (1U << 18)) << 4 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 8); - ++in; - *out |= ((*in) % (1U << 18)) << 8 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 12); - ++in; - *out |= ((*in) % (1U << 18)) << 12 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 2); - ++in; - *out |= ((*in) % (1U << 18)) << 2 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 6); - ++in; - *out |= ((*in) % (1U << 18)) << 6 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 10); - ++in; - *out |= ((*in) % (1U << 18)) << 10 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 18)) >> (18 - 14); - ++in; - *out |= ((*in)) << 14 ; -} - - - - -void __fastpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 19) ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 6); - ++in; - *out |= ((*in) % (1U << 19)) << 6 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 12); - ++in; - *out |= ((*in) % (1U << 19)) << 12 
; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 5); - ++in; - *out |= ((*in) % (1U << 19)) << 5 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 11); - ++in; - *out |= ((*in) % (1U << 19)) << 11 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 4); - ++in; - *out |= ((*in) % (1U << 19)) << 4 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 10); - ++in; - *out |= ((*in) % (1U << 19)) << 10 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 3); - ++in; - *out |= ((*in) % (1U << 19)) << 3 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 9); - ++in; - *out |= ((*in) % (1U << 19)) << 9 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 2); - ++in; - *out |= ((*in) % (1U << 19)) << 2 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 8); - ++in; - *out |= ((*in) % (1U << 19)) << 8 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 1); - ++in; - *out |= ((*in) % (1U << 19)) << 1 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 7); - ++in; - *out |= ((*in) % (1U << 19)) << 7 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 19)) >> (19 - 13); - ++in; - *out |= ((*in)) << 13 ; -} - - - - -void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 
20) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) % (1U << 20) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) % (1U << 20) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) % (1U << 20) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 20)) >> (20 - 
12); - ++in; - *out |= ((*in)) << 12 ; -} - - - - -void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 21) ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 10); - ++in; - *out |= ((*in) % (1U << 21)) << 10 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 9); - ++in; - *out |= ((*in) % (1U << 21)) << 9 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 8); - ++in; - *out |= ((*in) % (1U << 21)) << 8 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 7); - ++in; - *out |= ((*in) % (1U << 21)) << 7 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 6); - ++in; - *out |= ((*in) % (1U << 21)) << 6 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 5); - ++in; - *out |= ((*in) % (1U << 21)) << 5 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 4); - ++in; - *out |= ((*in) % (1U << 21)) << 4 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 3); - ++in; - *out |= ((*in) % (1U << 21)) << 3 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 
2); - ++in; - *out |= ((*in) % (1U << 21)) << 2 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 1); - ++in; - *out |= ((*in) % (1U << 21)) << 1 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 21)) >> (21 - 11); - ++in; - *out |= ((*in)) << 11 ; -} - - - - -void __fastpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 22) ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 2); - ++in; - *out |= ((*in) % (1U << 22)) << 2 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 4); - ++in; - *out |= ((*in) % (1U << 22)) << 4 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 6); - ++in; - *out |= ((*in) % (1U << 22)) << 6 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 8); - ++in; - *out |= ((*in) % (1U << 22)) << 8 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - ++in; - *out = (*in) % (1U << 22) ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 2); - ++in; - *out |= ((*in) % (1U << 22)) << 2 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out 
= ((*in) % (1U << 22)) >> (22 - 4); - ++in; - *out |= ((*in) % (1U << 22)) << 4 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 6); - ++in; - *out |= ((*in) % (1U << 22)) << 6 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 8); - ++in; - *out |= ((*in) % (1U << 22)) << 8 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 22)) >> (22 - 10); - ++in; - *out |= ((*in)) << 10 ; -} - - - - -void __fastpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 23) ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 5); - ++in; - *out |= ((*in) % (1U << 23)) << 5 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 1); - ++in; - *out |= ((*in) % (1U << 23)) << 1 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 6); - ++in; - *out |= ((*in) % (1U << 23)) << 6 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 2); - ++in; - *out |= ((*in) % (1U << 23)) << 2 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 16); - ++in; - 
*out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 7); - ++in; - *out |= ((*in) % (1U << 23)) << 7 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 3); - ++in; - *out |= ((*in) % (1U << 23)) << 3 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 8); - ++in; - *out |= ((*in) % (1U << 23)) << 8 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 4); - ++in; - *out |= ((*in) % (1U << 23)) << 4 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 23)) >> (23 - 9); - ++in; - *out |= ((*in)) << 9 ; -} - - - - -void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - 
++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) % (1U << 24) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; -} - - - - -void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 25) ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 4); - ++in; - *out |= ((*in) % (1U << 25)) << 4 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - 
*out = ((*in) % (1U << 25)) >> (25 - 1); - ++in; - *out |= ((*in) % (1U << 25)) << 1 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 5); - ++in; - *out |= ((*in) % (1U << 25)) << 5 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 2); - ++in; - *out |= ((*in) % (1U << 25)) << 2 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 6); - ++in; - *out |= ((*in) % (1U << 25)) << 6 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 3); - ++in; - *out |= ((*in) % (1U << 25)) << 3 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 25)) >> (25 - 7); - ++in; - *out |= ((*in)) << 7 ; -} - - - - -void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 26) ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 20); - ++in; - 
*out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 2); - ++in; - *out |= ((*in) % (1U << 26)) << 2 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 4); - ++in; - *out |= ((*in) % (1U << 26)) << 4 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - ++in; - *out = (*in) % (1U << 26) ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 2); - ++in; - *out |= ((*in) % (1U << 26)) << 2 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 4); - ++in; - *out |= ((*in) % (1U << 26)) << 4 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 24); - ++in; - *out 
|= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 26)) >> (26 - 6); - ++in; - *out |= ((*in)) << 6 ; -} - - - - -void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 27) ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 2); - ++in; - *out |= ((*in) % (1U << 27)) << 2 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 4); - ++in; - *out |= ((*in) % (1U << 27)) << 4 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 1); - ++in; - *out |= ((*in) % (1U << 27)) << 1 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 27)) >> 
(27 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 3); - ++in; - *out |= ((*in) % (1U << 27)) << 3 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 27)) >> (27 - 5); - ++in; - *out |= ((*in)) << 5 ; -} - - - - -void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 28) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) % (1U << 28) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % 
(1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) % (1U << 28) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) % (1U << 28) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; -} - - - - -void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 29) ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 11); - ++in; - 
*out |= ((*in)) << 11 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 2); - ++in; - *out |= ((*in) % (1U << 29)) << 2 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 1); - ++in; - *out |= ((*in) % (1U << 29)) << 1 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 27); - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in) % (1U << 29)) >> (29 - 3); - ++in; - *out |= 
((*in)) << 3 ; -} - - - - -void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 30) ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++out; - ++in; - *out = (*in) % (1U << 30) ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - 
*out = ((*in) % (1U << 30)) >> (30 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in) % (1U << 30)) >> (30 - 2); - ++in; - *out |= ((*in)) << 2 ; -} - - - - -void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) % (1U << 31) ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 30); - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 29); - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 27); - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in) % (1U << 
31)) >> (31 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++out; - *out = ((*in) % (1U << 31)) >> (31 - 1); - ++in; - *out |= ((*in)) << 1 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= 
((*in)) << 12 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 21 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 23 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 25 ; - ++in; - *out |= ((*in)) << 26 ; - ++in; - *out |= ((*in)) << 27 ; - ++in; - *out |= ((*in)) << 28 ; - ++in; - *out |= ((*in)) << 29 ; - ++in; - *out |= ((*in)) << 30 ; - ++in; - *out |= ((*in)) << 31 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 26 ; - ++in; - *out |= ((*in)) << 28 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 26 ; - ++in; - *out |= ((*in)) << 28 ; - ++in; - *out |= ((*in)) << 30 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask3(const 
uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 21 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 27 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (3 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 25 ; - ++in; - *out |= ((*in)) << 28 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (3 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 23 ; - ++in; - *out |= ((*in)) << 26 ; - ++in; - *out |= ((*in)) << 29 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out 
|= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 28 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 25 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (5 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 23 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (5 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 21 ; - ++in; - *out |= ((*in)) << 26 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (5 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (5 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 27 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - 
*out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (6 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (6 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (6 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (6 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 26 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 21 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (7 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 24 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (7 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (7 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 9 ; - ++in; - 
*out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 23 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (7 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (7 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (7 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 25 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 24 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask9(const uint32_t 
*__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (9 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 22 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (9 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (9 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 21 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (9 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (9 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (9 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (9 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (9 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 23 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (10 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (10 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = 
((*in)) >> (10 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (10 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (10 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (10 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (10 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (10 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 22 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (11 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (11 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (11 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (11 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (11 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (11 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= 
((*in)) << 17 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (11 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (11 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 19 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (11 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 20 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (11 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 21 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (12 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (12 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (12 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (12 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (12 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (12 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (12 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= 
((*in)) << 28 ; - ++out; - *out = ((*in)) >> (12 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 20 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (13 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (13 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (13 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (13 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (13 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (13 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (13 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (13 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 17 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (13 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (13 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 18 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (13 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (13 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 19 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = 
(*in) ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (14 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (14 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (14 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (14 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (14 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (14 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (14 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (14 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (14 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (14 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (14 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (14 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 18 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 15 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (15 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (15 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 26 ; - 
++out; - *out = ((*in)) >> (15 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (15 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (15 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (15 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (15 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 16 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (15 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (15 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (15 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (15 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (15 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (15 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (15 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 17 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 
16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 16 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (17 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (17 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (17 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (17 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (17 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (17 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (17 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (17 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (17 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (17 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (17 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (17 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= 
((*in)) << 24 ; - ++out; - *out = ((*in)) >> (17 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (17 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (17 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (17 - 15); - ++in; - *out |= ((*in)) << 15 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (18 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (18 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (18 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (18 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (18 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (18 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (18 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (18 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (18 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (18 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (18 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (18 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (18 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - 
*out = ((*in)) >> (18 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (18 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (18 - 14); - ++in; - *out |= ((*in)) << 14 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (19 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (19 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (19 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (19 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (19 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (19 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (19 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (19 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (19 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (19 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (19 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (19 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (19 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (19 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (19 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (19 - 1); - ++in; - *out |= ((*in)) << 
1 ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (19 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (19 - 13); - ++in; - *out |= ((*in)) << 13 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (20 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (20 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (20 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (20 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (20 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (20 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (20 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (20 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (20 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) 
<< 24 ; - ++out; - *out = ((*in)) >> (20 - 12); - ++in; - *out |= ((*in)) << 12 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (21 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (21 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (21 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (21 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (21 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (21 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (21 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (21 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (21 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (21 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (21 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (21 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (21 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (21 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (21 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (21 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in)) >> (21 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (21 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (21 - 1); - ++in; - *out |= ((*in)) << 1 ; - 
++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (21 - 11); - ++in; - *out |= ((*in)) << 11 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (22 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (22 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (22 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (22 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (22 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (22 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (22 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (22 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (22 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (22 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (22 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (22 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (22 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (22 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (22 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (22 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (22 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (22 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = 
((*in)) >> (22 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (22 - 10); - ++in; - *out |= ((*in)) << 10 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (23 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (23 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (23 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (23 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (23 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (23 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (23 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (23 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (23 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in)) >> (23 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (23 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (23 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (23 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (23 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (23 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (23 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (23 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (23 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (23 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = 
((*in)) >> (23 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (23 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (23 - 9); - ++in; - *out |= ((*in)) << 9 ; -} - - - -/*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (24 - 16); - ++in; - 
*out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (24 - 8); - ++in; - *out |= ((*in)) << 8 ; +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (21 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (21 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (21 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (21 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (21 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (21 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (21 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (21 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (21 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (21 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (21 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (21 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (21 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (21 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (21 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (21 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in)) >> (21 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (21 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (21 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 
22; + ++out; + *out = ((*in)) >> (21 - 11); + ++in; + *out |= ((*in)) << 11; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (25 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (25 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in)) >> (25 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (25 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (25 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (25 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (25 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (25 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (25 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (25 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (25 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (25 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (25 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in)) >> (25 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (25 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (25 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in)) >> (25 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (25 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (25 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (25 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (25 - 3); - ++in; - *out |= ((*in)) << 3 ; 
- ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (25 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (25 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (25 - 7); - ++in; - *out |= ((*in)) << 7 ; +void __fastpackwithoutmask22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (22 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (22 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (22 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (22 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (22 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (22 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (22 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (22 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (22 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (22 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (22 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (22 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (22 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (22 - 8); + ++in; + *out |= ((*in)) << 8; + 
++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (22 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (22 - 10); + ++in; + *out |= ((*in)) << 10; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (26 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (26 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (26 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (26 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (26 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (26 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (26 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (26 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (26 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (26 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (26 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (26 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (26 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (26 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (26 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (26 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (26 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (26 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (26 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (26 - 4); - 
++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (26 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (26 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (26 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (26 - 6); - ++in; - *out |= ((*in)) << 6 ; +void __fastpackwithoutmask23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (23 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (23 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (23 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (23 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (23 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (23 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (23 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (23 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (23 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (23 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (23 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (23 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (23 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (23 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (23 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (23 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (23 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (23 - 
22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (23 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in)) >> (23 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (23 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (23 - 9); + ++in; + *out |= ((*in)) << 9; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (27 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (27 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (27 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (27 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in)) >> (27 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (27 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (27 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (27 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (27 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in)) >> (27 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (27 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (27 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (27 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (27 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in)) >> (27 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in)) >> (27 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (27 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (27 - 18); - ++in; - *out |= ((*in)) << 18 ; - 
++out; - *out = ((*in)) >> (27 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in)) >> (27 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (27 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (27 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (27 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (27 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (27 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (27 - 5); - ++in; - *out |= ((*in)) << 5 ; +void __fastpackwithoutmask24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + 
*out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = 
((*in)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (28 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (28 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (28 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (28 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (28 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (28 - 4); - ++in; - *out |= ((*in)) << 4 ; +void __fastpackwithoutmask25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (25 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (25 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (25 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (25 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (25 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (25 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (25 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (25 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (25 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (25 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (25 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (25 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (25 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in)) >> (25 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (25 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (25 - 13); + ++in; + *out |= ((*in)) << 13; + 
++out; + *out = ((*in)) >> (25 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (25 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (25 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (25 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (25 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (25 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (25 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (25 - 7); + ++in; + *out |= ((*in)) << 7; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (29 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (29 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (29 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (29 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (29 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (29 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in)) >> (29 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (29 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++out; - *out = ((*in)) >> (29 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (29 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (29 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (29 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (29 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (29 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (29 - 13); - ++in; - *out |= ((*in)) << 13 ; - 
++out; - *out = ((*in)) >> (29 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (29 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in)) >> (29 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in)) >> (29 - 1); - ++in; - *out |= ((*in)) << 1 ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (29 - 27); - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (29 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (29 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (29 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (29 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (29 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (29 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in)) >> (29 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in)) >> (29 - 3); - ++in; - *out |= ((*in)) << 3 ; +void __fastpackwithoutmask26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (26 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (26 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (26 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (26 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (26 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (26 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (26 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + 
*out = ((*in)) >> (26 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (26 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (26 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (26 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (26 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (26 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (26 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (26 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (26 - 6); + ++in; + *out |= ((*in)) << 6; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (30 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (30 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (30 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (30 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (30 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (30 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (30 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (30 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (30 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (30 - 10); - ++in; - *out |= 
((*in)) << 10 ; - ++out; - *out = ((*in)) >> (30 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (30 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in)) >> (30 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in)) >> (30 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (30 - 28); - ++in; - *out |= ((*in)) << 28 ; - ++out; - *out = ((*in)) >> (30 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (30 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (30 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (30 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (30 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (30 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (30 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (30 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (30 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (30 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (30 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in)) >> (30 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in)) >> (30 - 2); - ++in; - *out |= ((*in)) << 2 ; +void __fastpackwithoutmask27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (27 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (27 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (27 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (27 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in)) >> (27 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (27 - 24); + ++in; + *out |= 
((*in)) << 24; + ++out; + *out = ((*in)) >> (27 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (27 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (27 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in)) >> (27 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (27 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (27 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (27 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (27 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (27 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (27 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (27 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (27 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (27 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in)) >> (27 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (27 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (27 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (27 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (27 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (27 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (27 - 5); + ++in; + *out |= ((*in)) << 5; } - - /*assumes that integers fit in the prescribed number of bits */ -void __fastpackwithoutmask31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - *out = (*in) ; - ++in; - *out |= ((*in)) << 31 ; - ++out; - *out = ((*in)) >> (31 - 30); - ++in; - *out |= ((*in)) << 30 ; - ++out; - *out = ((*in)) >> (31 - 29); - ++in; - *out |= ((*in)) << 29 ; - ++out; - *out = ((*in)) >> (31 - 28); - ++in; - *out |= ((*in)) << 28 ; - 
++out; - *out = ((*in)) >> (31 - 27); - ++in; - *out |= ((*in)) << 27 ; - ++out; - *out = ((*in)) >> (31 - 26); - ++in; - *out |= ((*in)) << 26 ; - ++out; - *out = ((*in)) >> (31 - 25); - ++in; - *out |= ((*in)) << 25 ; - ++out; - *out = ((*in)) >> (31 - 24); - ++in; - *out |= ((*in)) << 24 ; - ++out; - *out = ((*in)) >> (31 - 23); - ++in; - *out |= ((*in)) << 23 ; - ++out; - *out = ((*in)) >> (31 - 22); - ++in; - *out |= ((*in)) << 22 ; - ++out; - *out = ((*in)) >> (31 - 21); - ++in; - *out |= ((*in)) << 21 ; - ++out; - *out = ((*in)) >> (31 - 20); - ++in; - *out |= ((*in)) << 20 ; - ++out; - *out = ((*in)) >> (31 - 19); - ++in; - *out |= ((*in)) << 19 ; - ++out; - *out = ((*in)) >> (31 - 18); - ++in; - *out |= ((*in)) << 18 ; - ++out; - *out = ((*in)) >> (31 - 17); - ++in; - *out |= ((*in)) << 17 ; - ++out; - *out = ((*in)) >> (31 - 16); - ++in; - *out |= ((*in)) << 16 ; - ++out; - *out = ((*in)) >> (31 - 15); - ++in; - *out |= ((*in)) << 15 ; - ++out; - *out = ((*in)) >> (31 - 14); - ++in; - *out |= ((*in)) << 14 ; - ++out; - *out = ((*in)) >> (31 - 13); - ++in; - *out |= ((*in)) << 13 ; - ++out; - *out = ((*in)) >> (31 - 12); - ++in; - *out |= ((*in)) << 12 ; - ++out; - *out = ((*in)) >> (31 - 11); - ++in; - *out |= ((*in)) << 11 ; - ++out; - *out = ((*in)) >> (31 - 10); - ++in; - *out |= ((*in)) << 10 ; - ++out; - *out = ((*in)) >> (31 - 9); - ++in; - *out |= ((*in)) << 9 ; - ++out; - *out = ((*in)) >> (31 - 8); - ++in; - *out |= ((*in)) << 8 ; - ++out; - *out = ((*in)) >> (31 - 7); - ++in; - *out |= ((*in)) << 7 ; - ++out; - *out = ((*in)) >> (31 - 6); - ++in; - *out |= ((*in)) << 6 ; - ++out; - *out = ((*in)) >> (31 - 5); - ++in; - *out |= ((*in)) << 5 ; - ++out; - *out = ((*in)) >> (31 - 4); - ++in; - *out |= ((*in)) << 4 ; - ++out; - *out = ((*in)) >> (31 - 3); - ++in; - *out |= ((*in)) << 3 ; - ++out; - *out = ((*in)) >> (31 - 2); - ++in; - *out |= ((*in)) << 2 ; - ++out; - *out = ((*in)) >> (31 - 1); - ++in; - *out |= ((*in)) << 1 ; +void 
__fastpackwithoutmask28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 
8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; } - - -void fastunpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __fastunpack0(in, out); - break; - case 1: - __fastunpack1(in, out); - break; - case 2: - __fastunpack2(in, out); - break; - case 3: - __fastunpack3(in, out); - break; - case 4: - __fastunpack4(in, out); - break; - case 5: - __fastunpack5(in, out); - break; - case 6: - __fastunpack6(in, out); - break; - case 7: - __fastunpack7(in, out); - break; - case 8: - __fastunpack8(in, out); - break; - case 9: - __fastunpack9(in, out); - break; - case 10: - __fastunpack10(in, out); - break; - case 11: - __fastunpack11(in, out); - break; - case 12: - __fastunpack12(in, out); - break; - case 13: - __fastunpack13(in, out); - break; - case 14: - __fastunpack14(in, out); - break; - case 15: - __fastunpack15(in, out); - break; - case 16: - __fastunpack16(in, out); - break; - case 17: - __fastunpack17(in, out); - break; - case 18: - __fastunpack18(in, out); - break; - case 19: - __fastunpack19(in, out); - break; - case 20: - __fastunpack20(in, out); - break; - case 21: - __fastunpack21(in, out); - break; - case 22: - __fastunpack22(in, out); - break; - case 23: - __fastunpack23(in, out); - break; - case 24: - __fastunpack24(in, out); - break; - case 25: - __fastunpack25(in, out); - break; - case 26: - __fastunpack26(in, out); - break; - case 27: - __fastunpack27(in, out); - break; - case 28: - __fastunpack28(in, out); - break; - case 29: - __fastunpack29(in, out); - break; - case 30: - __fastunpack30(in, out); - break; - case 31: - __fastunpack31(in, out); - break; - case 32: - __fastunpack32(in, out); - break; - default: - break; - } +/*assumes that integers fit in the 
prescribed number of bits */ +void __fastpackwithoutmask29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (29 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (29 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (29 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (29 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (29 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (29 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (29 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (29 - 5); + ++in; + *out |= ((*in)) << 5; + ++out; + *out = ((*in)) >> (29 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (29 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (29 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (29 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (29 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (29 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (29 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in)) >> (29 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (29 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in)) >> (29 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in)) >> (29 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (29 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (29 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (29 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (29 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (29 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = 
((*in)) >> (29 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (29 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in)) >> (29 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (29 - 3); + ++in; + *out |= ((*in)) << 3; } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (30 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (30 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (30 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (30 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (30 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (30 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (30 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in)) >> (30 - 2); + ++in; + *out |= ((*in)) << 2; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + 
++out; + *out = ((*in)) >> (30 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (30 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (30 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (30 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (30 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (30 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (30 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in)) >> (30 - 2); + ++in; + *out |= ((*in)) << 2; +} - -void fastpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. - switch (bit) { - case 0: - __fastpack0(in, out); - break; - case 1: - __fastpack1(in, out); - break; - case 2: - __fastpack2(in, out); - break; - case 3: - __fastpack3(in, out); - break; - case 4: - __fastpack4(in, out); - break; - case 5: - __fastpack5(in, out); - break; - case 6: - __fastpack6(in, out); - break; - case 7: - __fastpack7(in, out); - break; - case 8: - __fastpack8(in, out); - break; - case 9: - __fastpack9(in, out); - break; - case 10: - __fastpack10(in, out); - break; - case 11: - __fastpack11(in, out); - break; - case 12: - __fastpack12(in, out); - break; - case 13: - __fastpack13(in, out); - break; - case 14: - __fastpack14(in, out); - break; - case 15: - __fastpack15(in, out); - break; - case 16: - __fastpack16(in, out); - break; - case 17: - __fastpack17(in, out); - break; - case 18: - __fastpack18(in, out); - break; - case 19: - __fastpack19(in, out); - break; - case 20: - __fastpack20(in, out); - break; - case 21: - __fastpack21(in, out); - break; - case 22: - __fastpack22(in, out); - break; - case 23: - __fastpack23(in, out); - break; - case 24: - __fastpack24(in, out); 
- break; - case 25: - __fastpack25(in, out); - break; - case 26: - __fastpack26(in, out); - break; - case 27: - __fastpack27(in, out); - break; - case 28: - __fastpack28(in, out); - break; - case 29: - __fastpack29(in, out); - break; - case 30: - __fastpack30(in, out); - break; - case 31: - __fastpack31(in, out); - break; - case 32: - __fastpack32(in, out); - break; - default: - break; - } +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in); + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (31 - 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (31 - 29); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (31 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (31 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (31 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (31 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (31 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (31 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (31 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (31 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (31 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (31 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (31 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (31 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (31 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (31 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (31 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (31 - 13); + ++in; + *out |= ((*in)) << 13; + ++out; + *out = ((*in)) >> (31 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + 
*out = ((*in)) >> (31 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (31 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (31 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in)) >> (31 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (31 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in)) >> (31 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (31 - 5); + ++in; + *out |= ((*in)) << 5; + ++out; + *out = ((*in)) >> (31 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in)) >> (31 - 3); + ++in; + *out |= ((*in)) << 3; + ++out; + *out = ((*in)) >> (31 - 2); + ++in; + *out |= ((*in)) << 2; + ++out; + *out = ((*in)) >> (31 - 1); + ++in; + *out |= ((*in)) << 1; } +void fastunpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __fastunpack0(in, out); + break; + case 1: + __fastunpack1(in, out); + break; + case 2: + __fastunpack2(in, out); + break; + case 3: + __fastunpack3(in, out); + break; + case 4: + __fastunpack4(in, out); + break; + case 5: + __fastunpack5(in, out); + break; + case 6: + __fastunpack6(in, out); + break; + case 7: + __fastunpack7(in, out); + break; + case 8: + __fastunpack8(in, out); + break; + case 9: + __fastunpack9(in, out); + break; + case 10: + __fastunpack10(in, out); + break; + case 11: + __fastunpack11(in, out); + break; + case 12: + __fastunpack12(in, out); + break; + case 13: + __fastunpack13(in, out); + break; + case 14: + __fastunpack14(in, out); + break; + case 15: + __fastunpack15(in, out); + break; + case 16: + __fastunpack16(in, out); + break; + case 17: + __fastunpack17(in, out); + break; + case 18: + __fastunpack18(in, out); + break; + case 19: + __fastunpack19(in, out); + break; + case 20: + __fastunpack20(in, out); + break; + case 21: + __fastunpack21(in, out); + break; + case 22: + __fastunpack22(in, out); + break; + case 23: + __fastunpack23(in, out); + break; + case 24: + __fastunpack24(in, out); + break; + case 25: + __fastunpack25(in, out); + break; + case 26: + __fastunpack26(in, out); + break; + case 27: + __fastunpack27(in, out); + break; + case 28: + __fastunpack28(in, out); + break; + case 29: + __fastunpack29(in, out); + break; + case 30: + __fastunpack30(in, out); + break; + case 31: + __fastunpack31(in, out); + break; + case 32: + __fastunpack32(in, out); + break; + default: + break; + } +} +void fastpack(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __fastpack0(in, out); + break; + case 1: + __fastpack1(in, out); + break; + case 2: + __fastpack2(in, out); + break; + case 3: + __fastpack3(in, out); + break; + case 4: + __fastpack4(in, out); + break; + case 5: + __fastpack5(in, out); + break; + case 6: + __fastpack6(in, out); + break; + case 7: + __fastpack7(in, out); + break; + case 8: + __fastpack8(in, out); + break; + case 9: + __fastpack9(in, out); + break; + case 10: + __fastpack10(in, out); + break; + case 11: + __fastpack11(in, out); + break; + case 12: + __fastpack12(in, out); + break; + case 13: + __fastpack13(in, out); + break; + case 14: + __fastpack14(in, out); + break; + case 15: + __fastpack15(in, out); + break; + case 16: + __fastpack16(in, out); + break; + case 17: + __fastpack17(in, out); + break; + case 18: + __fastpack18(in, out); + break; + case 19: + __fastpack19(in, out); + break; + case 20: + __fastpack20(in, out); + break; + case 21: + __fastpack21(in, out); + break; + case 22: + __fastpack22(in, out); + break; + case 23: + __fastpack23(in, out); + break; + case 24: + __fastpack24(in, out); + break; + case 25: + __fastpack25(in, out); + break; + case 26: + __fastpack26(in, out); + break; + case 27: + __fastpack27(in, out); + break; + case 28: + __fastpack28(in, out); + break; + case 29: + __fastpack29(in, out); + break; + case 30: + __fastpack30(in, out); + break; + case 31: + __fastpack31(in, out); + break; + case 32: + __fastpack32(in, out); + break; + default: + break; + } +} /*assumes that integers fit in the prescribed number of bits*/ -void fastpackwithoutmask(const uint32_t *__restrict__ in, uint32_t *__restrict__ out, const uint32_t bit) { - // Could have used function pointers instead of switch. - // Switch calls do offer the compiler more opportunities for optimization in - // theory. In this case, it makes no difference with a good compiler. 
- switch (bit) { - case 0: - __fastpackwithoutmask0(in, out); - break; - case 1: - __fastpackwithoutmask1(in, out); - break; - case 2: - __fastpackwithoutmask2(in, out); - break; - case 3: - __fastpackwithoutmask3(in, out); - break; - case 4: - __fastpackwithoutmask4(in, out); - break; - case 5: - __fastpackwithoutmask5(in, out); - break; - case 6: - __fastpackwithoutmask6(in, out); - break; - case 7: - __fastpackwithoutmask7(in, out); - break; - case 8: - __fastpackwithoutmask8(in, out); - break; - case 9: - __fastpackwithoutmask9(in, out); - break; - case 10: - __fastpackwithoutmask10(in, out); - break; - case 11: - __fastpackwithoutmask11(in, out); - break; - case 12: - __fastpackwithoutmask12(in, out); - break; - case 13: - __fastpackwithoutmask13(in, out); - break; - case 14: - __fastpackwithoutmask14(in, out); - break; - case 15: - __fastpackwithoutmask15(in, out); - break; - case 16: - __fastpackwithoutmask16(in, out); - break; - case 17: - __fastpackwithoutmask17(in, out); - break; - case 18: - __fastpackwithoutmask18(in, out); - break; - case 19: - __fastpackwithoutmask19(in, out); - break; - case 20: - __fastpackwithoutmask20(in, out); - break; - case 21: - __fastpackwithoutmask21(in, out); - break; - case 22: - __fastpackwithoutmask22(in, out); - break; - case 23: - __fastpackwithoutmask23(in, out); - break; - case 24: - __fastpackwithoutmask24(in, out); - break; - case 25: - __fastpackwithoutmask25(in, out); - break; - case 26: - __fastpackwithoutmask26(in, out); - break; - case 27: - __fastpackwithoutmask27(in, out); - break; - case 28: - __fastpackwithoutmask28(in, out); - break; - case 29: - __fastpackwithoutmask29(in, out); - break; - case 30: - __fastpackwithoutmask30(in, out); - break; - case 31: - __fastpackwithoutmask31(in, out); - break; - case 32: - __fastpackwithoutmask32(in, out); - break; - default: - break; - } +void fastpackwithoutmask(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + // Could have 
used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastpackwithoutmask0(in, out); + break; + case 1: + __fastpackwithoutmask1(in, out); + break; + case 2: + __fastpackwithoutmask2(in, out); + break; + case 3: + __fastpackwithoutmask3(in, out); + break; + case 4: + __fastpackwithoutmask4(in, out); + break; + case 5: + __fastpackwithoutmask5(in, out); + break; + case 6: + __fastpackwithoutmask6(in, out); + break; + case 7: + __fastpackwithoutmask7(in, out); + break; + case 8: + __fastpackwithoutmask8(in, out); + break; + case 9: + __fastpackwithoutmask9(in, out); + break; + case 10: + __fastpackwithoutmask10(in, out); + break; + case 11: + __fastpackwithoutmask11(in, out); + break; + case 12: + __fastpackwithoutmask12(in, out); + break; + case 13: + __fastpackwithoutmask13(in, out); + break; + case 14: + __fastpackwithoutmask14(in, out); + break; + case 15: + __fastpackwithoutmask15(in, out); + break; + case 16: + __fastpackwithoutmask16(in, out); + break; + case 17: + __fastpackwithoutmask17(in, out); + break; + case 18: + __fastpackwithoutmask18(in, out); + break; + case 19: + __fastpackwithoutmask19(in, out); + break; + case 20: + __fastpackwithoutmask20(in, out); + break; + case 21: + __fastpackwithoutmask21(in, out); + break; + case 22: + __fastpackwithoutmask22(in, out); + break; + case 23: + __fastpackwithoutmask23(in, out); + break; + case 24: + __fastpackwithoutmask24(in, out); + break; + case 25: + __fastpackwithoutmask25(in, out); + break; + case 26: + __fastpackwithoutmask26(in, out); + break; + case 27: + __fastpackwithoutmask27(in, out); + break; + case 28: + __fastpackwithoutmask28(in, out); + break; + case 29: + __fastpackwithoutmask29(in, out); + break; + case 30: + __fastpackwithoutmask30(in, out); + break; + case 31: + __fastpackwithoutmask31(in, out); + break; + case 32: + 
__fastpackwithoutmask32(in, out); + break; + default: + break; + } } } // namespace SIMDCompressionLib diff --git a/src/for-gen.c b/src/for-gen.c index 997a871..ac98c79 100644 --- a/src/for-gen.c +++ b/src/for-gen.c @@ -3,16 +3,14 @@ * The pack/unpack routines will not work on big-endian architectures. */ -static uint32_t -pack0_n(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack0_n(uint32_t base, const uint32_t *in, uint8_t *out) { (void)base; (void)in; (void)out; return 0; } -static uint32_t -unpack0_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack0_32(uint32_t base, const uint8_t *in, uint32_t *out) { int k; (void)in; for (k = 0; k < 32; ++k) { @@ -21,8 +19,7 @@ unpack0_32(uint32_t base, const uint8_t *in, uint32_t *out) { return 0; } -static uint32_t -unpack0_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack0_16(uint32_t base, const uint8_t *in, uint32_t *out) { int k; (void)in; for (k = 0; k < 16; ++k) { @@ -31,8 +28,7 @@ unpack0_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 0; } -static uint32_t -unpack0_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack0_8(uint32_t base, const uint8_t *in, uint32_t *out) { int k; (void)in; for (k = 0; k < 8; ++k) { @@ -41,8 +37,8 @@ unpack0_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 0; } -static uint32_t -pack0_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack0_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { (void)base; (void)in; (void)out; @@ -50,8 +46,8 @@ pack0_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { return 0; } -static uint32_t -unpack0_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack0_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t k; (void)in; for (k = 0; k < length; ++k) { @@ -60,27 +56,25 @@ unpack0_x(uint32_t 
base, const uint8_t *in, uint32_t *out, uint32_t length) { return 0; } -static uint32_t -linsearch0_n(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch0_n(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { (void)in; if (base == value) *found = 0; return 0; } -static uint32_t -linsearch0_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, - int *found) { +static uint32_t linsearch0_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { (void)in; if (base == value && length > 0) *found = 0; return 0; } -static uint32_t -pack1_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack1_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 1; tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 3; @@ -119,50 +113,49 @@ pack1_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 4; } -static uint32_t -unpack1_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack1_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); - *(out + 8) = base + ((*in32 >> 8) & 1); - *(out + 9) = base + ((*in32 >> 9) & 1); - *(out + 10) = base + ((*in32 >> 10) & 1); - *(out + 11) = base + ((*in32 >> 11) & 1); - *(out + 12) = base + ((*in32 >> 12) & 1); - *(out + 13) = base + ((*in32 >> 13) & 1); - *(out + 14) = base + ((*in32 >> 14) & 1); - *(out + 15) = base + ((*in32 >> 15) & 1); - *(out + 16) = base + ((*in32 >> 16) & 1); - *(out + 
17) = base + ((*in32 >> 17) & 1); - *(out + 18) = base + ((*in32 >> 18) & 1); - *(out + 19) = base + ((*in32 >> 19) & 1); - *(out + 20) = base + ((*in32 >> 20) & 1); - *(out + 21) = base + ((*in32 >> 21) & 1); - *(out + 22) = base + ((*in32 >> 22) & 1); - *(out + 23) = base + ((*in32 >> 23) & 1); - *(out + 24) = base + ((*in32 >> 24) & 1); - *(out + 25) = base + ((*in32 >> 25) & 1); - *(out + 26) = base + ((*in32 >> 26) & 1); - *(out + 27) = base + ((*in32 >> 27) & 1); - *(out + 28) = base + ((*in32 >> 28) & 1); - *(out + 29) = base + ((*in32 >> 29) & 1); - *(out + 30) = base + ((*in32 >> 30) & 1); - *(out + 31) = base + ((*in32 >> 31) & 1); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1); + *(out + 1) = base + ((*in32 >> 1) & 1); + *(out + 2) = base + ((*in32 >> 2) & 1); + *(out + 3) = base + ((*in32 >> 3) & 1); + *(out + 4) = base + ((*in32 >> 4) & 1); + *(out + 5) = base + ((*in32 >> 5) & 1); + *(out + 6) = base + ((*in32 >> 6) & 1); + *(out + 7) = base + ((*in32 >> 7) & 1); + *(out + 8) = base + ((*in32 >> 8) & 1); + *(out + 9) = base + ((*in32 >> 9) & 1); + *(out + 10) = base + ((*in32 >> 10) & 1); + *(out + 11) = base + ((*in32 >> 11) & 1); + *(out + 12) = base + ((*in32 >> 12) & 1); + *(out + 13) = base + ((*in32 >> 13) & 1); + *(out + 14) = base + ((*in32 >> 14) & 1); + *(out + 15) = base + ((*in32 >> 15) & 1); + *(out + 16) = base + ((*in32 >> 16) & 1); + *(out + 17) = base + ((*in32 >> 17) & 1); + *(out + 18) = base + ((*in32 >> 18) & 1); + *(out + 19) = base + ((*in32 >> 19) & 1); + *(out + 20) = base + ((*in32 >> 20) & 1); + *(out + 21) = base + ((*in32 >> 21) & 1); + *(out + 22) = base + ((*in32 >> 22) & 1); + *(out + 23) = base + ((*in32 >> 23) & 1); + *(out + 24) = base + ((*in32 >> 24) & 1); + *(out + 25) = base + ((*in32 >> 25) & 1); + *(out + 26) = base + ((*in32 >> 26) & 1); + *(out + 27) = base + ((*in32 >> 27) & 1); + *(out + 28) = base + ((*in32 >> 28) & 1); + *(out + 29) = base + ((*in32 >> 29) & 1); + *(out + 30) = base 
+ ((*in32 >> 30) & 1); + *(out + 31) = base + ((*in32 >> 31) & 1); /* remaining: 0 bits */ return 4; } -static uint32_t -pack2_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack2_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 2; tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 6; @@ -181,7 +174,7 @@ pack2_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 2; tmp |= (*(in + 18) - base) << 4; tmp |= (*(in + 19) - base) << 6; @@ -204,52 +197,51 @@ pack2_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 8; } -static uint32_t -unpack2_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack2_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); - *(out + 8) = base + ((*in32 >> 16) & 3); - *(out + 9) = base + ((*in32 >> 18) & 3); - *(out + 10) = base + ((*in32 >> 20) & 3); - *(out + 11) = base + ((*in32 >> 22) & 3); - *(out + 12) = base + ((*in32 >> 24) & 3); - *(out + 13) = base + ((*in32 >> 26) & 3); - *(out + 14) = base + ((*in32 >> 28) & 3); - *(out + 15) = base + ((*in32 >> 30) & 3); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 3); + *(out + 1) = base + ((*in32 >> 2) & 3); + *(out + 2) = base + ((*in32 >> 4) & 3); + *(out + 3) = base + ((*in32 >> 6) & 3); + *(out + 4) = base + 
((*in32 >> 8) & 3); + *(out + 5) = base + ((*in32 >> 10) & 3); + *(out + 6) = base + ((*in32 >> 12) & 3); + *(out + 7) = base + ((*in32 >> 14) & 3); + *(out + 8) = base + ((*in32 >> 16) & 3); + *(out + 9) = base + ((*in32 >> 18) & 3); + *(out + 10) = base + ((*in32 >> 20) & 3); + *(out + 11) = base + ((*in32 >> 22) & 3); + *(out + 12) = base + ((*in32 >> 24) & 3); + *(out + 13) = base + ((*in32 >> 26) & 3); + *(out + 14) = base + ((*in32 >> 28) & 3); + *(out + 15) = base + ((*in32 >> 30) & 3); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 16) = base + ((*in32 >> 0) & 3); - *(out + 17) = base + ((*in32 >> 2) & 3); - *(out + 18) = base + ((*in32 >> 4) & 3); - *(out + 19) = base + ((*in32 >> 6) & 3); - *(out + 20) = base + ((*in32 >> 8) & 3); - *(out + 21) = base + ((*in32 >> 10) & 3); - *(out + 22) = base + ((*in32 >> 12) & 3); - *(out + 23) = base + ((*in32 >> 14) & 3); - *(out + 24) = base + ((*in32 >> 16) & 3); - *(out + 25) = base + ((*in32 >> 18) & 3); - *(out + 26) = base + ((*in32 >> 20) & 3); - *(out + 27) = base + ((*in32 >> 22) & 3); - *(out + 28) = base + ((*in32 >> 24) & 3); - *(out + 29) = base + ((*in32 >> 26) & 3); - *(out + 30) = base + ((*in32 >> 28) & 3); - *(out + 31) = base + ((*in32 >> 30) & 3); + *(out + 16) = base + ((*in32 >> 0) & 3); + *(out + 17) = base + ((*in32 >> 2) & 3); + *(out + 18) = base + ((*in32 >> 4) & 3); + *(out + 19) = base + ((*in32 >> 6) & 3); + *(out + 20) = base + ((*in32 >> 8) & 3); + *(out + 21) = base + ((*in32 >> 10) & 3); + *(out + 22) = base + ((*in32 >> 12) & 3); + *(out + 23) = base + ((*in32 >> 14) & 3); + *(out + 24) = base + ((*in32 >> 16) & 3); + *(out + 25) = base + ((*in32 >> 18) & 3); + *(out + 26) = base + ((*in32 >> 20) & 3); + *(out + 27) = base + ((*in32 >> 22) & 3); + *(out + 28) = base + ((*in32 >> 24) & 3); + *(out + 29) = base + ((*in32 >> 26) & 3); + *(out + 30) = base + ((*in32 >> 28) & 3); + *(out + 31) = base + ((*in32 >> 30) & 3); /* remaining: 0 bits */ return 8; } -static uint32_t 
-pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 3; tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 9; @@ -263,7 +255,7 @@ pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 10) - base) >> (3 - 1); + tmp = (*(in + 10) - base) >> (3 - 1); tmp |= (*(in + 11) - base) << 1; tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 7; @@ -278,7 +270,7 @@ pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 21) - base) >> (3 - 2); + tmp = (*(in + 21) - base) >> (3 - 2); tmp |= (*(in + 22) - base) << 2; tmp |= (*(in + 23) - base) << 5; tmp |= (*(in + 24) - base) << 8; @@ -296,58 +288,57 @@ pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 12; } -static uint32_t -unpack3_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack3_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); - *(out + 8) = base + ((*in32 >> 24) & 7); - *(out + 9) = base + ((*in32 >> 27) & 7); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 7); + *(out + 1) = base + ((*in32 >> 3) & 7); + *(out + 2) = base + ((*in32 >> 6) & 7); + *(out + 3) = base + ((*in32 >> 9) & 7); + *(out + 4) = base + ((*in32 >> 12) & 7); + *(out + 
5) = base + ((*in32 >> 15) & 7); + *(out + 6) = base + ((*in32 >> 18) & 7); + *(out + 7) = base + ((*in32 >> 21) & 7); + *(out + 8) = base + ((*in32 >> 24) & 7); + *(out + 9) = base + ((*in32 >> 27) & 7); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 1)) << (3 - 1); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 1) & 7); - *(out + 12) = base + ((*in32 >> 4) & 7); - *(out + 13) = base + ((*in32 >> 7) & 7); - *(out + 14) = base + ((*in32 >> 10) & 7); - *(out + 15) = base + ((*in32 >> 13) & 7); - *(out + 16) = base + ((*in32 >> 16) & 7); - *(out + 17) = base + ((*in32 >> 19) & 7); - *(out + 18) = base + ((*in32 >> 22) & 7); - *(out + 19) = base + ((*in32 >> 25) & 7); - *(out + 20) = base + ((*in32 >> 28) & 7); + *(out + 11) = base + ((*in32 >> 1) & 7); + *(out + 12) = base + ((*in32 >> 4) & 7); + *(out + 13) = base + ((*in32 >> 7) & 7); + *(out + 14) = base + ((*in32 >> 10) & 7); + *(out + 15) = base + ((*in32 >> 13) & 7); + *(out + 16) = base + ((*in32 >> 16) & 7); + *(out + 17) = base + ((*in32 >> 19) & 7); + *(out + 18) = base + ((*in32 >> 22) & 7); + *(out + 19) = base + ((*in32 >> 25) & 7); + *(out + 20) = base + ((*in32 >> 28) & 7); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (3 - 2); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 2) & 7); - *(out + 23) = base + ((*in32 >> 5) & 7); - *(out + 24) = base + ((*in32 >> 8) & 7); - *(out + 25) = base + ((*in32 >> 11) & 7); - *(out + 26) = base + ((*in32 >> 14) & 7); - *(out + 27) = base + ((*in32 >> 17) & 7); - *(out + 28) = base + ((*in32 >> 20) & 7); - *(out + 29) = base + ((*in32 >> 23) & 7); - *(out + 30) = base + ((*in32 >> 26) & 7); - *(out + 31) = base + ((*in32 >> 29) & 7); + *(out + 22) = base + ((*in32 >> 2) & 7); + *(out + 23) = base + ((*in32 >> 5) & 7); + *(out + 24) = base + ((*in32 >> 8) & 7); + *(out + 25) = base + ((*in32 >> 11) & 7); + *(out + 26) = base + ((*in32 >> 14) & 7); + *(out 
+ 27) = base + ((*in32 >> 17) & 7); + *(out + 28) = base + ((*in32 >> 20) & 7); + *(out + 29) = base + ((*in32 >> 23) & 7); + *(out + 30) = base + ((*in32 >> 26) & 7); + *(out + 31) = base + ((*in32 >> 29) & 7); /* remaining: 0 bits */ return 12; } -static uint32_t -pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 4; tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 12; @@ -358,7 +349,7 @@ pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 4; tmp |= (*(in + 10) - base) << 8; tmp |= (*(in + 11) - base) << 12; @@ -369,7 +360,7 @@ pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 4; tmp |= (*(in + 18) - base) << 8; tmp |= (*(in + 19) - base) << 12; @@ -380,7 +371,7 @@ pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 4; tmp |= (*(in + 26) - base) << 8; tmp |= (*(in + 27) - base) << 12; @@ -395,56 +386,55 @@ pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 16; } -static uint32_t -unpack4_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack4_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = 
base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + ((*in32 >> 28) & 15); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 15); + *(out + 1) = base + ((*in32 >> 4) & 15); + *(out + 2) = base + ((*in32 >> 8) & 15); + *(out + 3) = base + ((*in32 >> 12) & 15); + *(out + 4) = base + ((*in32 >> 16) & 15); + *(out + 5) = base + ((*in32 >> 20) & 15); + *(out + 6) = base + ((*in32 >> 24) & 15); + *(out + 7) = base + ((*in32 >> 28) & 15); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 8) = base + ((*in32 >> 0) & 15); - *(out + 9) = base + ((*in32 >> 4) & 15); - *(out + 10) = base + ((*in32 >> 8) & 15); - *(out + 11) = base + ((*in32 >> 12) & 15); - *(out + 12) = base + ((*in32 >> 16) & 15); - *(out + 13) = base + ((*in32 >> 20) & 15); - *(out + 14) = base + ((*in32 >> 24) & 15); - *(out + 15) = base + ((*in32 >> 28) & 15); + *(out + 8) = base + ((*in32 >> 0) & 15); + *(out + 9) = base + ((*in32 >> 4) & 15); + *(out + 10) = base + ((*in32 >> 8) & 15); + *(out + 11) = base + ((*in32 >> 12) & 15); + *(out + 12) = base + ((*in32 >> 16) & 15); + *(out + 13) = base + ((*in32 >> 20) & 15); + *(out + 14) = base + ((*in32 >> 24) & 15); + *(out + 15) = base + ((*in32 >> 28) & 15); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 16) = base + ((*in32 >> 0) & 15); - *(out + 17) = base + ((*in32 >> 4) & 15); - *(out + 18) = base + ((*in32 >> 8) & 15); - *(out + 19) = base + ((*in32 >> 12) & 15); - *(out + 20) = base + ((*in32 >> 16) & 15); - *(out + 21) = base + ((*in32 >> 20) & 15); - *(out + 22) = base + ((*in32 >> 24) & 15); - *(out + 23) = base + ((*in32 >> 28) & 15); + *(out + 16) = base + ((*in32 >> 0) & 15); + *(out + 17) = base + ((*in32 >> 4) & 15); + *(out + 18) = base + ((*in32 >> 8) & 15); + *(out + 19) = base + ((*in32 >> 12) & 15); + *(out + 20) = base + ((*in32 >> 16) & 
15); + *(out + 21) = base + ((*in32 >> 20) & 15); + *(out + 22) = base + ((*in32 >> 24) & 15); + *(out + 23) = base + ((*in32 >> 28) & 15); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 24) = base + ((*in32 >> 0) & 15); - *(out + 25) = base + ((*in32 >> 4) & 15); - *(out + 26) = base + ((*in32 >> 8) & 15); - *(out + 27) = base + ((*in32 >> 12) & 15); - *(out + 28) = base + ((*in32 >> 16) & 15); - *(out + 29) = base + ((*in32 >> 20) & 15); - *(out + 30) = base + ((*in32 >> 24) & 15); - *(out + 31) = base + ((*in32 >> 28) & 15); + *(out + 24) = base + ((*in32 >> 0) & 15); + *(out + 25) = base + ((*in32 >> 4) & 15); + *(out + 26) = base + ((*in32 >> 8) & 15); + *(out + 27) = base + ((*in32 >> 12) & 15); + *(out + 28) = base + ((*in32 >> 16) & 15); + *(out + 29) = base + ((*in32 >> 20) & 15); + *(out + 30) = base + ((*in32 >> 24) & 15); + *(out + 31) = base + ((*in32 >> 28) & 15); /* remaining: 0 bits */ return 16; } -static uint32_t -pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 5; tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 15; @@ -454,7 +444,7 @@ pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); + tmp = (*(in + 6) - base) >> (5 - 3); tmp |= (*(in + 7) - base) << 3; tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 13; @@ -464,7 +454,7 @@ pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 12) - base) >> (5 - 1); + tmp = (*(in + 12) - base) >> (5 - 1); tmp |= (*(in + 13) - base) << 1; tmp |= (*(in + 14) - base) << 6; tmp |= (*(in + 15) - base) << 11; @@ -475,7 +465,7 @@ pack5_32(uint32_t 
base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 19) - base) >> (5 - 4); + tmp = (*(in + 19) - base) >> (5 - 4); tmp |= (*(in + 20) - base) << 4; tmp |= (*(in + 21) - base) << 9; tmp |= (*(in + 22) - base) << 14; @@ -485,7 +475,7 @@ pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 25) - base) >> (5 - 2); + tmp = (*(in + 25) - base) >> (5 - 2); tmp |= (*(in + 26) - base) << 2; tmp |= (*(in + 27) - base) << 7; tmp |= (*(in + 28) - base) << 12; @@ -499,66 +489,65 @@ pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 20; } -static uint32_t -unpack5_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack5_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + ((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 31); + *(out + 1) = base + ((*in32 >> 5) & 31); + *(out + 2) = base + ((*in32 >> 10) & 31); + *(out + 3) = base + ((*in32 >> 15) & 31); + *(out + 4) = base + ((*in32 >> 20) & 31); + *(out + 5) = base + ((*in32 >> 25) & 31); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) << (5 - 3); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); - *(out + 8) = base + ((*in32 >> 8) & 31); - *(out + 9) = base + ((*in32 >> 13) & 31); - *(out + 10) = base + ((*in32 >> 18) & 31); - *(out + 11) = base + ((*in32 >> 23) & 31); + *(out + 7) = base + ((*in32 >> 3) & 31); + *(out + 8) = base + ((*in32 >> 8) & 31); + *(out + 9) = base + ((*in32 >> 13) & 31); + 
*(out + 10) = base + ((*in32 >> 18) & 31); + *(out + 11) = base + ((*in32 >> 23) & 31); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 1)) << (5 - 1); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 1) & 31); - *(out + 14) = base + ((*in32 >> 6) & 31); - *(out + 15) = base + ((*in32 >> 11) & 31); - *(out + 16) = base + ((*in32 >> 16) & 31); - *(out + 17) = base + ((*in32 >> 21) & 31); - *(out + 18) = base + ((*in32 >> 26) & 31); + *(out + 13) = base + ((*in32 >> 1) & 31); + *(out + 14) = base + ((*in32 >> 6) & 31); + *(out + 15) = base + ((*in32 >> 11) & 31); + *(out + 16) = base + ((*in32 >> 16) & 31); + *(out + 17) = base + ((*in32 >> 21) & 31); + *(out + 18) = base + ((*in32 >> 26) & 31); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (5 - 4); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 31); - *(out + 21) = base + ((*in32 >> 9) & 31); - *(out + 22) = base + ((*in32 >> 14) & 31); - *(out + 23) = base + ((*in32 >> 19) & 31); - *(out + 24) = base + ((*in32 >> 24) & 31); + *(out + 20) = base + ((*in32 >> 4) & 31); + *(out + 21) = base + ((*in32 >> 9) & 31); + *(out + 22) = base + ((*in32 >> 14) & 31); + *(out + 23) = base + ((*in32 >> 19) & 31); + *(out + 24) = base + ((*in32 >> 24) & 31); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (5 - 2); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 2) & 31); - *(out + 27) = base + ((*in32 >> 7) & 31); - *(out + 28) = base + ((*in32 >> 12) & 31); - *(out + 29) = base + ((*in32 >> 17) & 31); - *(out + 30) = base + ((*in32 >> 22) & 31); - *(out + 31) = base + ((*in32 >> 27) & 31); + *(out + 26) = base + ((*in32 >> 2) & 31); + *(out + 27) = base + ((*in32 >> 7) & 31); + *(out + 28) = base + ((*in32 >> 12) & 31); + *(out + 29) = base + ((*in32 >> 17) & 31); + *(out + 30) = base + ((*in32 >> 22) & 31); + *(out + 31) = base + ((*in32 >> 27) & 31); /* 
remaining: 0 bits */ return 20; } -static uint32_t -pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 6; tmp |= (*(in + 2) - base) << 12; tmp |= (*(in + 3) - base) << 18; @@ -567,7 +556,7 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); + tmp = (*(in + 5) - base) >> (6 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 10; tmp |= (*(in + 8) - base) << 16; @@ -576,7 +565,7 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 10) - base) >> (6 - 2); + tmp = (*(in + 10) - base) >> (6 - 2); tmp |= (*(in + 11) - base) << 2; tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 14; @@ -585,7 +574,7 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 6; tmp |= (*(in + 18) - base) << 12; tmp |= (*(in + 19) - base) << 18; @@ -594,7 +583,7 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 21) - base) >> (6 - 4); + tmp = (*(in + 21) - base) >> (6 - 4); tmp |= (*(in + 22) - base) << 4; tmp |= (*(in + 23) - base) << 10; tmp |= (*(in + 24) - base) << 16; @@ -603,7 +592,7 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 26) - base) >> (6 - 2); + tmp = (*(in + 26) - base) >> (6 - 2); tmp |= (*(in + 
27) - base) << 2; tmp |= (*(in + 28) - base) << 8; tmp |= (*(in + 29) - base) << 14; @@ -616,68 +605,67 @@ pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 24; } -static uint32_t -unpack6_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack6_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); - *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 63); + *(out + 1) = base + ((*in32 >> 6) & 63); + *(out + 2) = base + ((*in32 >> 12) & 63); + *(out + 3) = base + ((*in32 >> 18) & 63); + *(out + 4) = base + ((*in32 >> 24) & 63); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (6 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + ((*in32 >> 10) & 63); - *(out + 8) = base + ((*in32 >> 16) & 63); - *(out + 9) = base + ((*in32 >> 22) & 63); + *(out + 6) = base + ((*in32 >> 4) & 63); + *(out + 7) = base + ((*in32 >> 10) & 63); + *(out + 8) = base + ((*in32 >> 16) & 63); + *(out + 9) = base + ((*in32 >> 22) & 63); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (6 - 2); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 2) & 63); - *(out + 12) = base + ((*in32 >> 8) & 63); - *(out + 13) = base + ((*in32 >> 14) & 63); - *(out + 14) = base + ((*in32 >> 20) & 63); - *(out + 15) = base + ((*in32 >> 26) & 63); + *(out + 11) = base + ((*in32 >> 2) & 63); + *(out + 12) = base + ((*in32 >> 8) & 63); + *(out + 13) = base + ((*in32 >> 14) & 63); + *(out + 14) = base + ((*in32 >> 20) & 63); + *(out + 15) = base + ((*in32 >> 26) & 63); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 16) = base + ((*in32 >> 0) 
& 63); - *(out + 17) = base + ((*in32 >> 6) & 63); - *(out + 18) = base + ((*in32 >> 12) & 63); - *(out + 19) = base + ((*in32 >> 18) & 63); - *(out + 20) = base + ((*in32 >> 24) & 63); + *(out + 16) = base + ((*in32 >> 0) & 63); + *(out + 17) = base + ((*in32 >> 6) & 63); + *(out + 18) = base + ((*in32 >> 12) & 63); + *(out + 19) = base + ((*in32 >> 18) & 63); + *(out + 20) = base + ((*in32 >> 24) & 63); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (6 - 4); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 4) & 63); - *(out + 23) = base + ((*in32 >> 10) & 63); - *(out + 24) = base + ((*in32 >> 16) & 63); - *(out + 25) = base + ((*in32 >> 22) & 63); + *(out + 22) = base + ((*in32 >> 4) & 63); + *(out + 23) = base + ((*in32 >> 10) & 63); + *(out + 24) = base + ((*in32 >> 16) & 63); + *(out + 25) = base + ((*in32 >> 22) & 63); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (6 - 2); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 2) & 63); - *(out + 28) = base + ((*in32 >> 8) & 63); - *(out + 29) = base + ((*in32 >> 14) & 63); - *(out + 30) = base + ((*in32 >> 20) & 63); - *(out + 31) = base + ((*in32 >> 26) & 63); + *(out + 27) = base + ((*in32 >> 2) & 63); + *(out + 28) = base + ((*in32 >> 8) & 63); + *(out + 29) = base + ((*in32 >> 14) & 63); + *(out + 30) = base + ((*in32 >> 20) & 63); + *(out + 31) = base + ((*in32 >> 26) & 63); /* remaining: 0 bits */ return 24; } -static uint32_t -pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 7; tmp |= (*(in + 2) - base) << 14; tmp |= (*(in + 3) - base) << 21; @@ -685,7 +673,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); + tmp = (*(in + 4) - base) >> (7 - 3); tmp |= (*(in + 5) - base) << 3; tmp |= (*(in + 6) - base) << 10; tmp |= (*(in + 7) - base) << 17; @@ -694,7 +682,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 9) - base) >> (7 - 6); + tmp = (*(in + 9) - base) >> (7 - 6); tmp |= (*(in + 10) - base) << 6; tmp |= (*(in + 11) - base) << 13; tmp |= (*(in + 12) - base) << 20; @@ -702,7 +690,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 13) - base) >> (7 - 2); + tmp = (*(in + 13) - base) >> (7 - 2); tmp |= (*(in + 14) - base) << 2; tmp |= (*(in + 15) - base) << 9; tmp |= (*(in + 16) - base) << 16; @@ -711,7 +699,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 18) - base) >> (7 - 5); + tmp = (*(in + 18) - base) >> (7 - 5); tmp |= (*(in + 19) - base) << 5; tmp |= (*(in + 20) - base) << 12; tmp |= (*(in + 21) - base) << 19; @@ -719,7 +707,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 22) - base) >> (7 - 1); + tmp = (*(in + 22) - base) >> (7 - 1); tmp |= (*(in + 23) - base) << 1; tmp |= (*(in + 24) - base) << 8; tmp |= (*(in + 25) - base) << 15; @@ -728,7 +716,7 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 27) - base) >> (7 - 4); + tmp = (*(in + 27) - base) >> (7 - 4); tmp |= (*(in + 28) - base) << 4; tmp |= (*(in + 29) - base) << 11; tmp |= (*(in + 30) - base) << 18; @@ -740,123 +728,122 @@ pack7_32(uint32_t base, const uint32_t *in, uint8_t 
*out) { return 28; } -static uint32_t -unpack7_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack7_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 127); + *(out + 1) = base + ((*in32 >> 7) & 127); + *(out + 2) = base + ((*in32 >> 14) & 127); + *(out + 3) = base + ((*in32 >> 21) & 127); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) << (7 - 3); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); - *(out + 8) = base + ((*in32 >> 24) & 127); + *(out + 5) = base + ((*in32 >> 3) & 127); + *(out + 6) = base + ((*in32 >> 10) & 127); + *(out + 7) = base + ((*in32 >> 17) & 127); + *(out + 8) = base + ((*in32 >> 24) & 127); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (7 - 6); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 127); - *(out + 11) = base + ((*in32 >> 13) & 127); - *(out + 12) = base + ((*in32 >> 20) & 127); + *(out + 10) = base + ((*in32 >> 6) & 127); + *(out + 11) = base + ((*in32 >> 13) & 127); + *(out + 12) = base + ((*in32 >> 20) & 127); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 2)) << (7 - 2); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 127); - *(out + 15) = base + ((*in32 >> 9) & 127); - *(out + 16) = base + ((*in32 >> 16) & 127); - *(out + 17) = base + ((*in32 >> 23) & 127); + *(out + 14) = base + ((*in32 >> 2) & 127); + *(out + 15) = base + ((*in32 >> 9) & 127); + *(out + 16) = base + ((*in32 >> 16) & 127); + *(out + 17) = base + ((*in32 >> 23) 
& 127); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 5)) << (7 - 5); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 5) & 127); - *(out + 20) = base + ((*in32 >> 12) & 127); - *(out + 21) = base + ((*in32 >> 19) & 127); + *(out + 19) = base + ((*in32 >> 5) & 127); + *(out + 20) = base + ((*in32 >> 12) & 127); + *(out + 21) = base + ((*in32 >> 19) & 127); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 1)) << (7 - 1); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 1) & 127); - *(out + 24) = base + ((*in32 >> 8) & 127); - *(out + 25) = base + ((*in32 >> 15) & 127); - *(out + 26) = base + ((*in32 >> 22) & 127); + *(out + 23) = base + ((*in32 >> 1) & 127); + *(out + 24) = base + ((*in32 >> 8) & 127); + *(out + 25) = base + ((*in32 >> 15) & 127); + *(out + 26) = base + ((*in32 >> 22) & 127); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (7 - 4); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 127); - *(out + 29) = base + ((*in32 >> 11) & 127); - *(out + 30) = base + ((*in32 >> 18) & 127); - *(out + 31) = base + ((*in32 >> 25) & 127); + *(out + 28) = base + ((*in32 >> 4) & 127); + *(out + 29) = base + ((*in32 >> 11) & 127); + *(out + 30) = base + ((*in32 >> 18) & 127); + *(out + 31) = base + ((*in32 >> 25) & 127); /* remaining: 0 bits */ return 28; } -static uint32_t -pack8_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack8_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 8; tmp |= (*(in + 2) - base) << 16; tmp |= (*(in + 3) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 8; tmp |= (*(in + 6) - base) << 16; tmp 
|= (*(in + 7) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 8; tmp |= (*(in + 10) - base) << 16; tmp |= (*(in + 11) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 13) - base) << 8; tmp |= (*(in + 14) - base) << 16; tmp |= (*(in + 15) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 8; tmp |= (*(in + 18) - base) << 16; tmp |= (*(in + 19) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 20) - base) << 0; + tmp = (*(in + 20) - base) << 0; tmp |= (*(in + 21) - base) << 8; tmp |= (*(in + 22) - base) << 16; tmp |= (*(in + 23) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 8; tmp |= (*(in + 26) - base) << 16; tmp |= (*(in + 27) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 28) - base) << 0; + tmp = (*(in + 28) - base) << 0; tmp |= (*(in + 29) - base) << 8; tmp |= (*(in + 30) - base) << 16; tmp |= (*(in + 31) - base) << 24; @@ -867,71 +854,70 @@ pack8_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 32; } -static uint32_t -unpack8_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack8_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - 
*(out + 3) = base + ((*in32 >> 24) & 255); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 255); + *(out + 1) = base + ((*in32 >> 8) & 255); + *(out + 2) = base + ((*in32 >> 16) & 255); + *(out + 3) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); + *(out + 4) = base + ((*in32 >> 0) & 255); + *(out + 5) = base + ((*in32 >> 8) & 255); + *(out + 6) = base + ((*in32 >> 16) & 255); + *(out + 7) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 8) = base + ((*in32 >> 0) & 255); - *(out + 9) = base + ((*in32 >> 8) & 255); - *(out + 10) = base + ((*in32 >> 16) & 255); - *(out + 11) = base + ((*in32 >> 24) & 255); + *(out + 8) = base + ((*in32 >> 0) & 255); + *(out + 9) = base + ((*in32 >> 8) & 255); + *(out + 10) = base + ((*in32 >> 16) & 255); + *(out + 11) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 12) = base + ((*in32 >> 0) & 255); - *(out + 13) = base + ((*in32 >> 8) & 255); - *(out + 14) = base + ((*in32 >> 16) & 255); - *(out + 15) = base + ((*in32 >> 24) & 255); + *(out + 12) = base + ((*in32 >> 0) & 255); + *(out + 13) = base + ((*in32 >> 8) & 255); + *(out + 14) = base + ((*in32 >> 16) & 255); + *(out + 15) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 20) */ - *(out + 16) = base + ((*in32 >> 0) & 255); - *(out + 17) = base + ((*in32 >> 8) & 255); - *(out + 18) = base + ((*in32 >> 16) & 255); - *(out + 19) = base + ((*in32 >> 24) & 255); + *(out + 16) = base + ((*in32 >> 0) & 255); + *(out + 17) = base + ((*in32 >> 8) & 255); + *(out + 18) = base + ((*in32 >> 16) & 255); + *(out + 19) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 20) = base + ((*in32 >> 0) & 255); - *(out + 21) = base + ((*in32 >> 8) & 
255); - *(out + 22) = base + ((*in32 >> 16) & 255); - *(out + 23) = base + ((*in32 >> 24) & 255); + *(out + 20) = base + ((*in32 >> 0) & 255); + *(out + 21) = base + ((*in32 >> 8) & 255); + *(out + 22) = base + ((*in32 >> 16) & 255); + *(out + 23) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 24) = base + ((*in32 >> 0) & 255); - *(out + 25) = base + ((*in32 >> 8) & 255); - *(out + 26) = base + ((*in32 >> 16) & 255); - *(out + 27) = base + ((*in32 >> 24) & 255); + *(out + 24) = base + ((*in32 >> 0) & 255); + *(out + 25) = base + ((*in32 >> 8) & 255); + *(out + 26) = base + ((*in32 >> 16) & 255); + *(out + 27) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 28) = base + ((*in32 >> 0) & 255); - *(out + 29) = base + ((*in32 >> 8) & 255); - *(out + 30) = base + ((*in32 >> 16) & 255); - *(out + 31) = base + ((*in32 >> 24) & 255); + *(out + 28) = base + ((*in32 >> 0) & 255); + *(out + 29) = base + ((*in32 >> 8) & 255); + *(out + 30) = base + ((*in32 >> 16) & 255); + *(out + 31) = base + ((*in32 >> 24) & 255); /* remaining: 0 bits */ return 32; } -static uint32_t -pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 9; tmp |= (*(in + 2) - base) << 18; tmp |= (*(in + 3) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (9 - 4); + tmp = (*(in + 3) - base) >> (9 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 13; tmp |= (*(in + 6) - base) << 22; @@ -939,14 +925,14 @@ pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); + tmp = (*(in + 7) - base) >> (9 - 8); tmp |= (*(in + 8) - 
base) << 8; tmp |= (*(in + 9) - base) << 17; tmp |= (*(in + 10) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 10) - base) >> (9 - 3); + tmp = (*(in + 10) - base) >> (9 - 3); tmp |= (*(in + 11) - base) << 3; tmp |= (*(in + 12) - base) << 12; tmp |= (*(in + 13) - base) << 21; @@ -954,14 +940,14 @@ pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 14) - base) >> (9 - 7); + tmp = (*(in + 14) - base) >> (9 - 7); tmp |= (*(in + 15) - base) << 7; tmp |= (*(in + 16) - base) << 16; tmp |= (*(in + 17) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 17) - base) >> (9 - 2); + tmp = (*(in + 17) - base) >> (9 - 2); tmp |= (*(in + 18) - base) << 2; tmp |= (*(in + 19) - base) << 11; tmp |= (*(in + 20) - base) << 20; @@ -969,14 +955,14 @@ pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 21) - base) >> (9 - 6); + tmp = (*(in + 21) - base) >> (9 - 6); tmp |= (*(in + 22) - base) << 6; tmp |= (*(in + 23) - base) << 15; tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 24) - base) >> (9 - 1); + tmp = (*(in + 24) - base) >> (9 - 1); tmp |= (*(in + 25) - base) << 1; tmp |= (*(in + 26) - base) << 10; tmp |= (*(in + 27) - base) << 19; @@ -984,7 +970,7 @@ pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 28) - base) >> (9 - 5); + tmp = (*(in + 28) - base) >> (9 - 5); tmp |= (*(in + 29) - base) << 5; tmp |= (*(in + 30) - base) << 14; tmp |= (*(in + 31) - base) << 23; @@ -995,145 +981,144 @@ pack9_32(uint32_t base, const uint32_t *in, uint8_t 
*out) { return 36; } -static uint32_t -unpack9_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack9_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + ((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 511); + *(out + 1) = base + ((*in32 >> 9) & 511); + *(out + 2) = base + ((*in32 >> 18) & 511); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (9 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); + *(out + 4) = base + ((*in32 >> 4) & 511); + *(out + 5) = base + ((*in32 >> 13) & 511); + *(out + 6) = base + ((*in32 >> 22) & 511); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (9 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 511); - *(out + 9) = base + ((*in32 >> 17) & 511); + *(out + 8) = base + ((*in32 >> 8) & 511); + *(out + 9) = base + ((*in32 >> 17) & 511); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 3)) << (9 - 3); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 3) & 511); - *(out + 12) = base + ((*in32 >> 12) & 511); - *(out + 13) = base + ((*in32 >> 21) & 511); + *(out + 11) = base + ((*in32 >> 3) & 511); + *(out + 12) = base + ((*in32 >> 12) & 511); + *(out + 13) = base + ((*in32 >> 21) & 511); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 7)) << (9 - 7); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 7) & 511); - *(out + 16) = base + ((*in32 >> 16) & 511); + *(out + 15) = base + ((*in32 >> 7) & 511); + *(out + 16) = base + ((*in32 >> 16) & 511); tmp = (*in32 >> 25); in32++; /* consumed: 4 
bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (9 - 2); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 2) & 511); - *(out + 19) = base + ((*in32 >> 11) & 511); - *(out + 20) = base + ((*in32 >> 20) & 511); + *(out + 18) = base + ((*in32 >> 2) & 511); + *(out + 19) = base + ((*in32 >> 11) & 511); + *(out + 20) = base + ((*in32 >> 20) & 511); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (9 - 6); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 6) & 511); - *(out + 23) = base + ((*in32 >> 15) & 511); + *(out + 22) = base + ((*in32 >> 6) & 511); + *(out + 23) = base + ((*in32 >> 15) & 511); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 1)) << (9 - 1); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 1) & 511); - *(out + 26) = base + ((*in32 >> 10) & 511); - *(out + 27) = base + ((*in32 >> 19) & 511); + *(out + 25) = base + ((*in32 >> 1) & 511); + *(out + 26) = base + ((*in32 >> 10) & 511); + *(out + 27) = base + ((*in32 >> 19) & 511); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 5)) << (9 - 5); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 5) & 511); - *(out + 30) = base + ((*in32 >> 14) & 511); - *(out + 31) = base + ((*in32 >> 23) & 511); + *(out + 29) = base + ((*in32 >> 5) & 511); + *(out + 30) = base + ((*in32 >> 14) & 511); + *(out + 31) = base + ((*in32 >> 23) & 511); /* remaining: 0 bits */ return 36; } -static uint32_t -pack10_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack10_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 10; tmp |= (*(in + 2) - base) << 20; tmp |= (*(in + 3) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); + tmp = (*(in 
+ 3) - base) >> (10 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 18; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); + tmp = (*(in + 6) - base) >> (10 - 6); tmp |= (*(in + 7) - base) << 6; tmp |= (*(in + 8) - base) << 16; tmp |= (*(in + 9) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 9) - base) >> (10 - 4); + tmp = (*(in + 9) - base) >> (10 - 4); tmp |= (*(in + 10) - base) << 4; tmp |= (*(in + 11) - base) << 14; tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 12) - base) >> (10 - 2); + tmp = (*(in + 12) - base) >> (10 - 2); tmp |= (*(in + 13) - base) << 2; tmp |= (*(in + 14) - base) << 12; tmp |= (*(in + 15) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 10; tmp |= (*(in + 18) - base) << 20; tmp |= (*(in + 19) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 19) - base) >> (10 - 8); + tmp = (*(in + 19) - base) >> (10 - 8); tmp |= (*(in + 20) - base) << 8; tmp |= (*(in + 21) - base) << 18; tmp |= (*(in + 22) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 22) - base) >> (10 - 6); + tmp = (*(in + 22) - base) >> (10 - 6); tmp |= (*(in + 23) - base) << 6; tmp |= (*(in + 24) - base) << 16; tmp |= (*(in + 25) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 25) - base) >> (10 - 4); + tmp = (*(in + 25) - base) >> (10 - 4); tmp |= (*(in + 26) - base) << 4; tmp |= (*(in + 27) - base) << 14; tmp |= (*(in + 28) - base) << 24; *(uint32_t *)out = 
tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 28) - base) >> (10 - 2); + tmp = (*(in + 28) - base) >> (10 - 2); tmp |= (*(in + 29) - base) << 2; tmp |= (*(in + 30) - base) << 12; tmp |= (*(in + 31) - base) << 22; @@ -1144,153 +1129,152 @@ pack10_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 40; } -static uint32_t -unpack10_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack10_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1023); + *(out + 1) = base + ((*in32 >> 10) & 1023); + *(out + 2) = base + ((*in32 >> 20) & 1023); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (10 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 18) & 1023); + *(out + 4) = base + ((*in32 >> 8) & 1023); + *(out + 5) = base + ((*in32 >> 18) & 1023); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (10 - 6); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); - *(out + 8) = base + ((*in32 >> 16) & 1023); + *(out + 7) = base + ((*in32 >> 6) & 1023); + *(out + 8) = base + ((*in32 >> 16) & 1023); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (10 - 4); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 1023); - *(out + 11) = base + ((*in32 >> 14) & 1023); + *(out + 10) = base + ((*in32 >> 4) & 1023); + *(out + 11) = base + ((*in32 >> 14) & 1023); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (10 - 2); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 2) & 1023); - *(out + 
14) = base + ((*in32 >> 12) & 1023); - *(out + 15) = base + ((*in32 >> 22) & 1023); + *(out + 13) = base + ((*in32 >> 2) & 1023); + *(out + 14) = base + ((*in32 >> 12) & 1023); + *(out + 15) = base + ((*in32 >> 22) & 1023); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 16) = base + ((*in32 >> 0) & 1023); - *(out + 17) = base + ((*in32 >> 10) & 1023); - *(out + 18) = base + ((*in32 >> 20) & 1023); + *(out + 16) = base + ((*in32 >> 0) & 1023); + *(out + 17) = base + ((*in32 >> 10) & 1023); + *(out + 18) = base + ((*in32 >> 20) & 1023); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 8)) << (10 - 8); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 8) & 1023); - *(out + 21) = base + ((*in32 >> 18) & 1023); + *(out + 20) = base + ((*in32 >> 8) & 1023); + *(out + 21) = base + ((*in32 >> 18) & 1023); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 6)) << (10 - 6); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 6) & 1023); - *(out + 24) = base + ((*in32 >> 16) & 1023); + *(out + 23) = base + ((*in32 >> 6) & 1023); + *(out + 24) = base + ((*in32 >> 16) & 1023); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (10 - 4); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 4) & 1023); - *(out + 27) = base + ((*in32 >> 14) & 1023); + *(out + 26) = base + ((*in32 >> 4) & 1023); + *(out + 27) = base + ((*in32 >> 14) & 1023); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 2)) << (10 - 2); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 2) & 1023); - *(out + 30) = base + ((*in32 >> 12) & 1023); - *(out + 31) = base + ((*in32 >> 22) & 1023); + *(out + 29) = base + ((*in32 >> 2) & 1023); + *(out + 30) = base + ((*in32 >> 12) & 1023); + *(out + 31) = base + ((*in32 >> 22) & 1023); /* remaining: 0 bits */ return 40; } -static uint32_t -pack11_32(uint32_t base, const 
uint32_t *in, uint8_t *out) { +static uint32_t pack11_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 11; tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (11 - 1); + tmp = (*(in + 2) - base) >> (11 - 1); tmp |= (*(in + 3) - base) << 1; tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); + tmp = (*(in + 5) - base) >> (11 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 13; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) >> (11 - 3); + tmp = (*(in + 8) - base) >> (11 - 3); tmp |= (*(in + 9) - base) << 3; tmp |= (*(in + 10) - base) << 14; tmp |= (*(in + 11) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 11) - base) >> (11 - 4); + tmp = (*(in + 11) - base) >> (11 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 15; tmp |= (*(in + 14) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 14) - base) >> (11 - 5); + tmp = (*(in + 14) - base) >> (11 - 5); tmp |= (*(in + 15) - base) << 5; tmp |= (*(in + 16) - base) << 16; tmp |= (*(in + 17) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 17) - base) >> (11 - 6); + tmp = (*(in + 17) - base) >> (11 - 6); tmp |= (*(in + 18) - base) << 6; tmp |= (*(in + 19) - base) << 17; tmp |= (*(in + 20) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 20) - base) >> (11 - 7); + tmp = (*(in + 
20) - base) >> (11 - 7); tmp |= (*(in + 21) - base) << 7; tmp |= (*(in + 22) - base) << 18; tmp |= (*(in + 23) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 23) - base) >> (11 - 8); + tmp = (*(in + 23) - base) >> (11 - 8); tmp |= (*(in + 24) - base) << 8; tmp |= (*(in + 25) - base) << 19; tmp |= (*(in + 26) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 26) - base) >> (11 - 9); + tmp = (*(in + 26) - base) >> (11 - 9); tmp |= (*(in + 27) - base) << 9; tmp |= (*(in + 28) - base) << 20; tmp |= (*(in + 29) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 29) - base) >> (11 - 10); + tmp = (*(in + 29) - base) >> (11 - 10); tmp |= (*(in + 30) - base) << 10; tmp |= (*(in + 31) - base) << 21; /* remaining: 0 bits */ @@ -1300,160 +1284,159 @@ pack11_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 44; } -static uint32_t -unpack11_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack11_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2047); + *(out + 1) = base + ((*in32 >> 11) & 2047); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 1)) << (11 - 1); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); + *(out + 3) = base + ((*in32 >> 1) & 2047); + *(out + 4) = base + ((*in32 >> 12) & 2047); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (11 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); + *(out + 6) 
= base + ((*in32 >> 2) & 2047); + *(out + 7) = base + ((*in32 >> 13) & 2047); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 3)) << (11 - 3); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 3) & 2047); - *(out + 10) = base + ((*in32 >> 14) & 2047); + *(out + 9) = base + ((*in32 >> 3) & 2047); + *(out + 10) = base + ((*in32 >> 14) & 2047); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (11 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 2047); - *(out + 13) = base + ((*in32 >> 15) & 2047); + *(out + 12) = base + ((*in32 >> 4) & 2047); + *(out + 13) = base + ((*in32 >> 15) & 2047); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 5)) << (11 - 5); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 5) & 2047); - *(out + 16) = base + ((*in32 >> 16) & 2047); + *(out + 15) = base + ((*in32 >> 5) & 2047); + *(out + 16) = base + ((*in32 >> 16) & 2047); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (11 - 6); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 6) & 2047); - *(out + 19) = base + ((*in32 >> 17) & 2047); + *(out + 18) = base + ((*in32 >> 6) & 2047); + *(out + 19) = base + ((*in32 >> 17) & 2047); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 7)) << (11 - 7); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 7) & 2047); - *(out + 22) = base + ((*in32 >> 18) & 2047); + *(out + 21) = base + ((*in32 >> 7) & 2047); + *(out + 22) = base + ((*in32 >> 18) & 2047); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (11 - 8); *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 2047); - *(out + 25) = base + ((*in32 >> 19) & 2047); + *(out + 24) = base + ((*in32 >> 8) & 2047); + *(out + 25) = base + ((*in32 >> 19) & 2047); tmp = (*in32 >> 30); 
in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 9)) << (11 - 9); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 9) & 2047); - *(out + 28) = base + ((*in32 >> 20) & 2047); + *(out + 27) = base + ((*in32 >> 9) & 2047); + *(out + 28) = base + ((*in32 >> 20) & 2047); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 10)) << (11 - 10); *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 10) & 2047); - *(out + 31) = base + ((*in32 >> 21) & 2047); + *(out + 30) = base + ((*in32 >> 10) & 2047); + *(out + 31) = base + ((*in32 >> 21) & 2047); /* remaining: 0 bits */ return 44; } -static uint32_t -pack12_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack12_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 12; tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); + tmp = (*(in + 2) - base) >> (12 - 4); tmp |= (*(in + 3) - base) << 4; tmp |= (*(in + 4) - base) << 16; tmp |= (*(in + 5) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); + tmp = (*(in + 5) - base) >> (12 - 8); tmp |= (*(in + 6) - base) << 8; tmp |= (*(in + 7) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 12; tmp |= (*(in + 10) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 10) - base) >> (12 - 4); + tmp = (*(in + 10) - base) >> (12 - 4); tmp |= (*(in + 11) - base) << 4; tmp |= (*(in + 12) - base) << 16; tmp |= (*(in + 13) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* 
consumed: 4 bytes (total: 20) */ - tmp = (*(in + 13) - base) >> (12 - 8); + tmp = (*(in + 13) - base) >> (12 - 8); tmp |= (*(in + 14) - base) << 8; tmp |= (*(in + 15) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 12; tmp |= (*(in + 18) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 18) - base) >> (12 - 4); + tmp = (*(in + 18) - base) >> (12 - 4); tmp |= (*(in + 19) - base) << 4; tmp |= (*(in + 20) - base) << 16; tmp |= (*(in + 21) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 21) - base) >> (12 - 8); + tmp = (*(in + 21) - base) >> (12 - 8); tmp |= (*(in + 22) - base) << 8; tmp |= (*(in + 23) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 12; tmp |= (*(in + 26) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 26) - base) >> (12 - 4); + tmp = (*(in + 26) - base) >> (12 - 4); tmp |= (*(in + 27) - base) << 4; tmp |= (*(in + 28) - base) << 16; tmp |= (*(in + 29) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 29) - base) >> (12 - 8); + tmp = (*(in + 29) - base) >> (12 - 8); tmp |= (*(in + 30) - base) << 8; tmp |= (*(in + 31) - base) << 20; /* remaining: 0 bits */ @@ -1463,165 +1446,164 @@ pack12_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 48; } -static uint32_t -unpack12_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack12_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + 
((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4095); + *(out + 1) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); + *(out + 3) = base + ((*in32 >> 4) & 4095); + *(out + 4) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); + *(out + 6) = base + ((*in32 >> 8) & 4095); + *(out + 7) = base + ((*in32 >> 20) & 4095); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 8) = base + ((*in32 >> 0) & 4095); - *(out + 9) = base + ((*in32 >> 12) & 4095); + *(out + 8) = base + ((*in32 >> 0) & 4095); + *(out + 9) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 4) & 4095); - *(out + 12) = base + ((*in32 >> 16) & 4095); + *(out + 11) = base + ((*in32 >> 4) & 4095); + *(out + 12) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 8) & 4095); - *(out + 15) = base + ((*in32 >> 20) & 4095); + *(out + 14) = base + ((*in32 >> 8) & 4095); + *(out + 15) = base + ((*in32 >> 20) & 4095); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 16) = base + ((*in32 >> 0) & 4095); - *(out + 17) = base + ((*in32 >> 12) & 4095); + *(out + 16) = base + ((*in32 >> 0) & 4095); + *(out + 17) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 4)) << 
(12 - 4); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 4) & 4095); - *(out + 20) = base + ((*in32 >> 16) & 4095); + *(out + 19) = base + ((*in32 >> 4) & 4095); + *(out + 20) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 8) & 4095); - *(out + 23) = base + ((*in32 >> 20) & 4095); + *(out + 22) = base + ((*in32 >> 8) & 4095); + *(out + 23) = base + ((*in32 >> 20) & 4095); in32++; /* consumed: 4 bytes (total: 40) */ - *(out + 24) = base + ((*in32 >> 0) & 4095); - *(out + 25) = base + ((*in32 >> 12) & 4095); + *(out + 24) = base + ((*in32 >> 0) & 4095); + *(out + 25) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 4) & 4095); - *(out + 28) = base + ((*in32 >> 16) & 4095); + *(out + 27) = base + ((*in32 >> 4) & 4095); + *(out + 28) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 8) & 4095); - *(out + 31) = base + ((*in32 >> 20) & 4095); + *(out + 30) = base + ((*in32 >> 8) & 4095); + *(out + 31) = base + ((*in32 >> 20) & 4095); /* remaining: 0 bits */ return 48; } -static uint32_t -pack13_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack13_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 13; tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); + tmp = (*(in + 2) - base) >> (13 - 7); tmp |= (*(in + 3) - base) << 7; tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = 
tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); + tmp = (*(in + 4) - base) >> (13 - 1); tmp |= (*(in + 5) - base) << 1; tmp |= (*(in + 6) - base) << 14; tmp |= (*(in + 7) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); + tmp = (*(in + 7) - base) >> (13 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (13 - 2); + tmp = (*(in + 9) - base) >> (13 - 2); tmp |= (*(in + 10) - base) << 2; tmp |= (*(in + 11) - base) << 15; tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 12) - base) >> (13 - 9); + tmp = (*(in + 12) - base) >> (13 - 9); tmp |= (*(in + 13) - base) << 9; tmp |= (*(in + 14) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 14) - base) >> (13 - 3); + tmp = (*(in + 14) - base) >> (13 - 3); tmp |= (*(in + 15) - base) << 3; tmp |= (*(in + 16) - base) << 16; tmp |= (*(in + 17) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 17) - base) >> (13 - 10); + tmp = (*(in + 17) - base) >> (13 - 10); tmp |= (*(in + 18) - base) << 10; tmp |= (*(in + 19) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 19) - base) >> (13 - 4); + tmp = (*(in + 19) - base) >> (13 - 4); tmp |= (*(in + 20) - base) << 4; tmp |= (*(in + 21) - base) << 17; tmp |= (*(in + 22) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 22) - base) >> (13 - 11); + tmp = (*(in + 22) - base) >> (13 - 11); tmp |= (*(in + 23) - base) << 11; tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out 
= tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 24) - base) >> (13 - 5); + tmp = (*(in + 24) - base) >> (13 - 5); tmp |= (*(in + 25) - base) << 5; tmp |= (*(in + 26) - base) << 18; tmp |= (*(in + 27) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 27) - base) >> (13 - 12); + tmp = (*(in + 27) - base) >> (13 - 12); tmp |= (*(in + 28) - base) << 12; tmp |= (*(in + 29) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 29) - base) >> (13 - 6); + tmp = (*(in + 29) - base) >> (13 - 6); tmp |= (*(in + 30) - base) << 6; tmp |= (*(in + 31) - base) << 19; /* remaining: 0 bits */ @@ -1631,178 +1613,177 @@ pack13_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 52; } -static uint32_t -unpack13_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack13_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8191); + *(out + 1) = base + ((*in32 >> 13) & 8191); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 7)) << (13 - 7); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); + *(out + 3) = base + ((*in32 >> 7) & 8191); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 1)) << (13 - 1); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); + *(out + 5) = base + ((*in32 >> 1) & 8191); + *(out + 6) = base + ((*in32 >> 14) & 8191); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 8)) << (13 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 8191); + *(out + 8) = 
base + ((*in32 >> 8) & 8191); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (13 - 2); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 8191); - *(out + 11) = base + ((*in32 >> 15) & 8191); + *(out + 10) = base + ((*in32 >> 2) & 8191); + *(out + 11) = base + ((*in32 >> 15) & 8191); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 9)) << (13 - 9); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 9) & 8191); + *(out + 13) = base + ((*in32 >> 9) & 8191); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 3)) << (13 - 3); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 3) & 8191); - *(out + 16) = base + ((*in32 >> 16) & 8191); + *(out + 15) = base + ((*in32 >> 3) & 8191); + *(out + 16) = base + ((*in32 >> 16) & 8191); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 10)) << (13 - 10); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 10) & 8191); + *(out + 18) = base + ((*in32 >> 10) & 8191); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (13 - 4); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 8191); - *(out + 21) = base + ((*in32 >> 17) & 8191); + *(out + 20) = base + ((*in32 >> 4) & 8191); + *(out + 21) = base + ((*in32 >> 17) & 8191); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 11)) << (13 - 11); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 11) & 8191); + *(out + 23) = base + ((*in32 >> 11) & 8191); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 5)) << (13 - 5); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 5) & 8191); - *(out + 26) = base + ((*in32 >> 18) & 8191); + *(out + 25) = base + ((*in32 >> 5) & 8191); + *(out + 26) = base + ((*in32 >> 18) & 8191); tmp = (*in32 >> 31); 
in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 12)) << (13 - 12); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 12) & 8191); + *(out + 28) = base + ((*in32 >> 12) & 8191); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 6)) << (13 - 6); *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 6) & 8191); - *(out + 31) = base + ((*in32 >> 19) & 8191); + *(out + 30) = base + ((*in32 >> 6) & 8191); + *(out + 31) = base + ((*in32 >> 19) & 8191); /* remaining: 0 bits */ return 52; } -static uint32_t -pack14_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack14_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 14; tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); + tmp = (*(in + 2) - base) >> (14 - 10); tmp |= (*(in + 3) - base) << 10; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); + tmp = (*(in + 4) - base) >> (14 - 6); tmp |= (*(in + 5) - base) << 6; tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); + tmp = (*(in + 6) - base) >> (14 - 2); tmp |= (*(in + 7) - base) << 2; tmp |= (*(in + 8) - base) << 16; tmp |= (*(in + 9) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (14 - 12); + tmp = (*(in + 9) - base) >> (14 - 12); tmp |= (*(in + 10) - base) << 12; tmp |= (*(in + 11) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 11) - base) >> (14 - 8); + tmp = (*(in + 11) - base) >> (14 - 8); 
tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 13) - base) >> (14 - 4); + tmp = (*(in + 13) - base) >> (14 - 4); tmp |= (*(in + 14) - base) << 4; tmp |= (*(in + 15) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 14; tmp |= (*(in + 18) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 18) - base) >> (14 - 10); + tmp = (*(in + 18) - base) >> (14 - 10); tmp |= (*(in + 19) - base) << 10; tmp |= (*(in + 20) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 20) - base) >> (14 - 6); + tmp = (*(in + 20) - base) >> (14 - 6); tmp |= (*(in + 21) - base) << 6; tmp |= (*(in + 22) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 22) - base) >> (14 - 2); + tmp = (*(in + 22) - base) >> (14 - 2); tmp |= (*(in + 23) - base) << 2; tmp |= (*(in + 24) - base) << 16; tmp |= (*(in + 25) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 25) - base) >> (14 - 12); + tmp = (*(in + 25) - base) >> (14 - 12); tmp |= (*(in + 26) - base) << 12; tmp |= (*(in + 27) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 27) - base) >> (14 - 8); + tmp = (*(in + 27) - base) >> (14 - 8); tmp |= (*(in + 28) - base) << 8; tmp |= (*(in + 29) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 29) - base) >> (14 - 4); + tmp = (*(in + 29) - base) >> (14 - 4); tmp |= (*(in + 30) - base) << 4; tmp |= (*(in + 31) - base) << 18; /* remaining: 0 bits */ @@ 
-1812,185 +1793,184 @@ pack14_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 56; } -static uint32_t -unpack14_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack14_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16383); + *(out + 1) = base + ((*in32 >> 14) & 16383); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (14 - 10); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); + *(out + 3) = base + ((*in32 >> 10) & 16383); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (14 - 6); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 16383); + *(out + 5) = base + ((*in32 >> 6) & 16383); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 2)) << (14 - 2); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); - *(out + 8) = base + ((*in32 >> 16) & 16383); + *(out + 7) = base + ((*in32 >> 2) & 16383); + *(out + 8) = base + ((*in32 >> 16) & 16383); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 12)) << (14 - 12); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 12) & 16383); + *(out + 10) = base + ((*in32 >> 12) & 16383); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (14 - 8); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 16383); + *(out + 12) = base + ((*in32 >> 8) & 16383); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (14 - 4); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 4) & 16383); - *(out + 15) = base + ((*in32 >> 18) & 16383); + *(out + 14) = base + 
((*in32 >> 4) & 16383); + *(out + 15) = base + ((*in32 >> 18) & 16383); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 16) = base + ((*in32 >> 0) & 16383); - *(out + 17) = base + ((*in32 >> 14) & 16383); + *(out + 16) = base + ((*in32 >> 0) & 16383); + *(out + 17) = base + ((*in32 >> 14) & 16383); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 10)) << (14 - 10); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 10) & 16383); + *(out + 19) = base + ((*in32 >> 10) & 16383); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 6)) << (14 - 6); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 6) & 16383); + *(out + 21) = base + ((*in32 >> 6) & 16383); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 2)) << (14 - 2); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 2) & 16383); - *(out + 24) = base + ((*in32 >> 16) & 16383); + *(out + 23) = base + ((*in32 >> 2) & 16383); + *(out + 24) = base + ((*in32 >> 16) & 16383); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 12)) << (14 - 12); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 12) & 16383); + *(out + 26) = base + ((*in32 >> 12) & 16383); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 8)) << (14 - 8); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 8) & 16383); + *(out + 28) = base + ((*in32 >> 8) & 16383); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (14 - 4); *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 4) & 16383); - *(out + 31) = base + ((*in32 >> 18) & 16383); + *(out + 30) = base + ((*in32 >> 4) & 16383); + *(out + 31) = base + ((*in32 >> 18) & 16383); /* remaining: 0 bits */ return 56; } -static uint32_t -pack15_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t 
pack15_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 15; tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); + tmp = (*(in + 2) - base) >> (15 - 13); tmp |= (*(in + 3) - base) << 13; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); + tmp = (*(in + 4) - base) >> (15 - 11); tmp |= (*(in + 5) - base) << 11; tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); + tmp = (*(in + 6) - base) >> (15 - 9); tmp |= (*(in + 7) - base) << 9; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) >> (15 - 7); + tmp = (*(in + 8) - base) >> (15 - 7); tmp |= (*(in + 9) - base) << 7; tmp |= (*(in + 10) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) >> (15 - 5); + tmp = (*(in + 10) - base) >> (15 - 5); tmp |= (*(in + 11) - base) << 5; tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) >> (15 - 3); + tmp = (*(in + 12) - base) >> (15 - 3); tmp |= (*(in + 13) - base) << 3; tmp |= (*(in + 14) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) >> (15 - 1); + tmp = (*(in + 14) - base) >> (15 - 1); tmp |= (*(in + 15) - base) << 1; tmp |= (*(in + 16) - base) << 16; tmp |= (*(in + 17) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 17) - base) >> (15 - 
14); + tmp = (*(in + 17) - base) >> (15 - 14); tmp |= (*(in + 18) - base) << 14; tmp |= (*(in + 19) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 19) - base) >> (15 - 12); + tmp = (*(in + 19) - base) >> (15 - 12); tmp |= (*(in + 20) - base) << 12; tmp |= (*(in + 21) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 21) - base) >> (15 - 10); + tmp = (*(in + 21) - base) >> (15 - 10); tmp |= (*(in + 22) - base) << 10; tmp |= (*(in + 23) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 23) - base) >> (15 - 8); + tmp = (*(in + 23) - base) >> (15 - 8); tmp |= (*(in + 24) - base) << 8; tmp |= (*(in + 25) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 25) - base) >> (15 - 6); + tmp = (*(in + 25) - base) >> (15 - 6); tmp |= (*(in + 26) - base) << 6; tmp |= (*(in + 27) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 27) - base) >> (15 - 4); + tmp = (*(in + 27) - base) >> (15 - 4); tmp |= (*(in + 28) - base) << 4; tmp |= (*(in + 29) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 29) - base) >> (15 - 2); + tmp = (*(in + 29) - base) >> (15 - 2); tmp |= (*(in + 30) - base) << 2; tmp |= (*(in + 31) - base) << 17; /* remaining: 0 bits */ @@ -2000,181 +1980,180 @@ pack15_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 60; } -static uint32_t -unpack15_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack15_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); + uint32_t tmp; + (void)tmp; + *(out 
+ 0) = base + ((*in32 >> 0) & 32767); + *(out + 1) = base + ((*in32 >> 15) & 32767); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 13)) << (15 - 13); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 13) & 32767); + *(out + 3) = base + ((*in32 >> 13) & 32767); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 11)) << (15 - 11); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); + *(out + 5) = base + ((*in32 >> 11) & 32767); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (15 - 9); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); + *(out + 7) = base + ((*in32 >> 9) & 32767); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 7)) << (15 - 7); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 7) & 32767); + *(out + 9) = base + ((*in32 >> 7) & 32767); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 5)) << (15 - 5); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 5) & 32767); + *(out + 11) = base + ((*in32 >> 5) & 32767); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 3)) << (15 - 3); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 3) & 32767); + *(out + 13) = base + ((*in32 >> 3) & 32767); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 1)) << (15 - 1); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 1) & 32767); - *(out + 16) = base + ((*in32 >> 16) & 32767); + *(out + 15) = base + ((*in32 >> 1) & 32767); + *(out + 16) = base + ((*in32 >> 16) & 32767); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 14)) << (15 - 14); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 14) & 32767); + *(out + 18) = base + ((*in32 >> 14) & 32767); tmp = (*in32 >> 
29); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 12)) << (15 - 12); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 12) & 32767); + *(out + 20) = base + ((*in32 >> 12) & 32767); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 10)) << (15 - 10); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 10) & 32767); + *(out + 22) = base + ((*in32 >> 10) & 32767); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (15 - 8); *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 32767); + *(out + 24) = base + ((*in32 >> 8) & 32767); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 6)) << (15 - 6); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 6) & 32767); + *(out + 26) = base + ((*in32 >> 6) & 32767); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (15 - 4); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 32767); + *(out + 28) = base + ((*in32 >> 4) & 32767); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 2)) << (15 - 2); *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 2) & 32767); - *(out + 31) = base + ((*in32 >> 17) & 32767); + *(out + 30) = base + ((*in32 >> 2) & 32767); + *(out + 31) = base + ((*in32 >> 17) & 32767); /* remaining: 0 bits */ return 60; } -static uint32_t -pack16_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack16_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; + tmp = (*(in + 2) - base) << 0; tmp |= (*(in + 3) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; + tmp = (*(in + 6) - base) << 0; tmp |= (*(in + 7) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) << 0; + tmp = (*(in + 10) - base) << 0; tmp |= (*(in + 11) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 13) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) << 0; + tmp = (*(in + 14) - base) << 0; tmp |= (*(in + 15) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 18) - base) << 0; + tmp = (*(in + 18) - base) << 0; tmp |= (*(in + 19) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 20) - base) << 0; + tmp = (*(in + 20) - base) << 0; tmp |= (*(in + 21) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 22) - base) << 0; + tmp = (*(in + 22) - base) << 0; tmp |= (*(in + 23) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 16; *(uint32_t *)out 
= tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 26) - base) << 0; + tmp = (*(in + 26) - base) << 0; tmp |= (*(in + 27) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 28) - base) << 0; + tmp = (*(in + 28) - base) << 0; tmp |= (*(in + 29) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 30) - base) << 0; + tmp = (*(in + 30) - base) << 0; tmp |= (*(in + 31) - base) << 16; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -2183,174 +2162,173 @@ pack16_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 64; } -static uint32_t -unpack16_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack16_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 65535); + *(out + 1) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); + *(out + 2) = base + ((*in32 >> 0) & 65535); + *(out + 3) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); + *(out + 4) = base + ((*in32 >> 0) & 65535); + *(out + 5) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); + *(out + 6) = base + ((*in32 >> 0) & 65535); + *(out + 7) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 20) */ - *(out + 8) = base + ((*in32 >> 0) & 65535); - *(out + 9) = base + ((*in32 >> 16) & 65535); + *(out + 8) = base + ((*in32 >> 
0) & 65535); + *(out + 9) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 10) = base + ((*in32 >> 0) & 65535); - *(out + 11) = base + ((*in32 >> 16) & 65535); + *(out + 10) = base + ((*in32 >> 0) & 65535); + *(out + 11) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 12) = base + ((*in32 >> 0) & 65535); - *(out + 13) = base + ((*in32 >> 16) & 65535); + *(out + 12) = base + ((*in32 >> 0) & 65535); + *(out + 13) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 14) = base + ((*in32 >> 0) & 65535); - *(out + 15) = base + ((*in32 >> 16) & 65535); + *(out + 14) = base + ((*in32 >> 0) & 65535); + *(out + 15) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 36) */ - *(out + 16) = base + ((*in32 >> 0) & 65535); - *(out + 17) = base + ((*in32 >> 16) & 65535); + *(out + 16) = base + ((*in32 >> 0) & 65535); + *(out + 17) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 40) */ - *(out + 18) = base + ((*in32 >> 0) & 65535); - *(out + 19) = base + ((*in32 >> 16) & 65535); + *(out + 18) = base + ((*in32 >> 0) & 65535); + *(out + 19) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 44) */ - *(out + 20) = base + ((*in32 >> 0) & 65535); - *(out + 21) = base + ((*in32 >> 16) & 65535); + *(out + 20) = base + ((*in32 >> 0) & 65535); + *(out + 21) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 48) */ - *(out + 22) = base + ((*in32 >> 0) & 65535); - *(out + 23) = base + ((*in32 >> 16) & 65535); + *(out + 22) = base + ((*in32 >> 0) & 65535); + *(out + 23) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 52) */ - *(out + 24) = base + ((*in32 >> 0) & 65535); - *(out + 25) = base + ((*in32 >> 16) & 65535); + *(out + 24) = base + ((*in32 >> 0) & 65535); + *(out + 25) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 56) */ - *(out + 26) = base + 
((*in32 >> 0) & 65535); - *(out + 27) = base + ((*in32 >> 16) & 65535); + *(out + 26) = base + ((*in32 >> 0) & 65535); + *(out + 27) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 60) */ - *(out + 28) = base + ((*in32 >> 0) & 65535); - *(out + 29) = base + ((*in32 >> 16) & 65535); + *(out + 28) = base + ((*in32 >> 0) & 65535); + *(out + 29) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 64) */ - *(out + 30) = base + ((*in32 >> 0) & 65535); - *(out + 31) = base + ((*in32 >> 16) & 65535); + *(out + 30) = base + ((*in32 >> 0) & 65535); + *(out + 31) = base + ((*in32 >> 16) & 65535); /* remaining: 0 bits */ return 64; } -static uint32_t -pack17_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack17_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); + tmp = (*(in + 1) - base) >> (17 - 2); tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); + tmp = (*(in + 3) - base) >> (17 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); + tmp = (*(in + 5) - base) >> (17 - 6); tmp |= (*(in + 6) - base) << 6; tmp |= (*(in + 7) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); + tmp = (*(in + 7) - base) >> (17 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 9) - base) >> (17 - 10); + tmp 
= (*(in + 9) - base) >> (17 - 10); tmp |= (*(in + 10) - base) << 10; tmp |= (*(in + 11) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 11) - base) >> (17 - 12); + tmp = (*(in + 11) - base) >> (17 - 12); tmp |= (*(in + 12) - base) << 12; tmp |= (*(in + 13) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 13) - base) >> (17 - 14); + tmp = (*(in + 13) - base) >> (17 - 14); tmp |= (*(in + 14) - base) << 14; tmp |= (*(in + 15) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 15) - base) >> (17 - 16); + tmp = (*(in + 15) - base) >> (17 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 16) - base) >> (17 - 1); + tmp = (*(in + 16) - base) >> (17 - 1); tmp |= (*(in + 17) - base) << 1; tmp |= (*(in + 18) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 18) - base) >> (17 - 3); + tmp = (*(in + 18) - base) >> (17 - 3); tmp |= (*(in + 19) - base) << 3; tmp |= (*(in + 20) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 20) - base) >> (17 - 5); + tmp = (*(in + 20) - base) >> (17 - 5); tmp |= (*(in + 21) - base) << 5; tmp |= (*(in + 22) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 22) - base) >> (17 - 7); + tmp = (*(in + 22) - base) >> (17 - 7); tmp |= (*(in + 23) - base) << 7; tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 24) - base) >> (17 - 9); + tmp = (*(in + 24) - base) >> (17 - 9); tmp |= (*(in + 25) - base) << 9; tmp |= (*(in + 26) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); 
/* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 26) - base) >> (17 - 11); + tmp = (*(in + 26) - base) >> (17 - 11); tmp |= (*(in + 27) - base) << 11; tmp |= (*(in + 28) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 28) - base) >> (17 - 13); + tmp = (*(in + 28) - base) >> (17 - 13); tmp |= (*(in + 29) - base) << 13; tmp |= (*(in + 30) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 30) - base) >> (17 - 15); + tmp = (*(in + 30) - base) >> (17 - 15); tmp |= (*(in + 31) - base) << 15; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -2359,53 +2337,53 @@ pack17_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 68; } -static uint32_t -unpack17_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack17_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 131071); tmp = (*in32 >> 17); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 2)) << (17 - 2); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); + *(out + 2) = base + ((*in32 >> 2) & 131071); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 4)) << (17 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); + *(out + 4) = base + ((*in32 >> 4) & 131071); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 6)) << (17 - 6); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); + *(out + 6) = base + ((*in32 >> 6) & 131071); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 8)) << (17 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 131071); + *(out + 8) = base + 
((*in32 >> 8) & 131071); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 10)) << (17 - 10); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 10) & 131071); + *(out + 10) = base + ((*in32 >> 10) & 131071); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 12)) << (17 - 12); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 12) & 131071); + *(out + 12) = base + ((*in32 >> 12) & 131071); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 14)) << (17 - 14); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 14) & 131071); + *(out + 14) = base + ((*in32 >> 14) & 131071); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 36) */ @@ -2416,154 +2394,153 @@ unpack17_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 1)) << (17 - 1); *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 1) & 131071); + *(out + 17) = base + ((*in32 >> 1) & 131071); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 3)) << (17 - 3); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 3) & 131071); + *(out + 19) = base + ((*in32 >> 3) & 131071); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 5)) << (17 - 5); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 5) & 131071); + *(out + 21) = base + ((*in32 >> 5) & 131071); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 7)) << (17 - 7); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 7) & 131071); + *(out + 23) = base + ((*in32 >> 7) & 131071); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 9)) << (17 - 9); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 9) & 131071); + *(out + 25) = base + ((*in32 >> 9) & 131071); tmp = (*in32 >> 26); 
in32++; /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 11)) << (17 - 11); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 11) & 131071); + *(out + 27) = base + ((*in32 >> 11) & 131071); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 64) */ tmp |= (*in32 % (1U << 13)) << (17 - 13); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 13) & 131071); + *(out + 29) = base + ((*in32 >> 13) & 131071); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 68) */ tmp |= (*in32 % (1U << 15)) << (17 - 15); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 15) & 131071); + *(out + 31) = base + ((*in32 >> 15) & 131071); /* remaining: 0 bits */ return 68; } -static uint32_t -pack18_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack18_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); + tmp = (*(in + 1) - base) >> (18 - 4); tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); + tmp = (*(in + 3) - base) >> (18 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); + tmp = (*(in + 5) - base) >> (18 - 12); tmp |= (*(in + 6) - base) << 12; tmp |= (*(in + 7) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (18 - 16); + tmp = (*(in + 7) - base) >> (18 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (18 - 
2); + tmp = (*(in + 8) - base) >> (18 - 2); tmp |= (*(in + 9) - base) << 2; tmp |= (*(in + 10) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (18 - 6); + tmp = (*(in + 10) - base) >> (18 - 6); tmp |= (*(in + 11) - base) << 6; tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 12) - base) >> (18 - 10); + tmp = (*(in + 12) - base) >> (18 - 10); tmp |= (*(in + 13) - base) << 10; tmp |= (*(in + 14) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 14) - base) >> (18 - 14); + tmp = (*(in + 14) - base) >> (18 - 14); tmp |= (*(in + 15) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 17) - base) >> (18 - 4); + tmp = (*(in + 17) - base) >> (18 - 4); tmp |= (*(in + 18) - base) << 4; tmp |= (*(in + 19) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 19) - base) >> (18 - 8); + tmp = (*(in + 19) - base) >> (18 - 8); tmp |= (*(in + 20) - base) << 8; tmp |= (*(in + 21) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 21) - base) >> (18 - 12); + tmp = (*(in + 21) - base) >> (18 - 12); tmp |= (*(in + 22) - base) << 12; tmp |= (*(in + 23) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 23) - base) >> (18 - 16); + tmp = (*(in + 23) - base) >> (18 - 16); tmp |= (*(in + 24) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 24) - base) >> (18 
- 2); + tmp = (*(in + 24) - base) >> (18 - 2); tmp |= (*(in + 25) - base) << 2; tmp |= (*(in + 26) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 26) - base) >> (18 - 6); + tmp = (*(in + 26) - base) >> (18 - 6); tmp |= (*(in + 27) - base) << 6; tmp |= (*(in + 28) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 28) - base) >> (18 - 10); + tmp = (*(in + 28) - base) >> (18 - 10); tmp |= (*(in + 29) - base) << 10; tmp |= (*(in + 30) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 30) - base) >> (18 - 14); + tmp = (*(in + 30) - base) >> (18 - 14); tmp |= (*(in + 31) - base) << 14; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -2572,29 +2549,29 @@ pack18_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 72; } -static uint32_t -unpack18_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack18_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 262143); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (18 - 4); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); + *(out + 2) = base + ((*in32 >> 4) & 262143); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (18 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); + *(out + 4) = base + ((*in32 >> 8) & 262143); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 12)) << (18 - 12); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); + *(out + 6) = base + ((*in32 >> 12) & 262143); tmp = (*in32 >> 30); in32++; /* 
consumed: 4 bytes (total: 20) */ @@ -2605,46 +2582,46 @@ unpack18_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (18 - 2); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 2) & 262143); + *(out + 9) = base + ((*in32 >> 2) & 262143); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (18 - 6); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 6) & 262143); + *(out + 11) = base + ((*in32 >> 6) & 262143); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 10)) << (18 - 10); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 10) & 262143); + *(out + 13) = base + ((*in32 >> 10) & 262143); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 14)) << (18 - 14); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 14) & 262143); + *(out + 15) = base + ((*in32 >> 14) & 262143); in32++; /* consumed: 4 bytes (total: 40) */ - *(out + 16) = base + ((*in32 >> 0) & 262143); + *(out + 16) = base + ((*in32 >> 0) & 262143); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 4)) << (18 - 4); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 4) & 262143); + *(out + 18) = base + ((*in32 >> 4) & 262143); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (18 - 8); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 8) & 262143); + *(out + 20) = base + ((*in32 >> 8) & 262143); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 12)) << (18 - 12); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 12) & 262143); + *(out + 22) = base + ((*in32 >> 12) & 262143); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 56) */ @@ -2655,135 +2632,134 @@ unpack18_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 
bytes (total: 60) */ tmp |= (*in32 % (1U << 2)) << (18 - 2); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 2) & 262143); + *(out + 25) = base + ((*in32 >> 2) & 262143); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 64) */ tmp |= (*in32 % (1U << 6)) << (18 - 6); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 6) & 262143); + *(out + 27) = base + ((*in32 >> 6) & 262143); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 68) */ tmp |= (*in32 % (1U << 10)) << (18 - 10); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 10) & 262143); + *(out + 29) = base + ((*in32 >> 10) & 262143); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 14)) << (18 - 14); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 14) & 262143); + *(out + 31) = base + ((*in32 >> 14) & 262143); /* remaining: 0 bits */ return 72; } -static uint32_t -pack19_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack19_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); + tmp = (*(in + 1) - base) >> (19 - 6); tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); + tmp = (*(in + 3) - base) >> (19 - 12); tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); + tmp = (*(in + 5) - base) >> (19 - 18); tmp |= (*(in + 6) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); + tmp = (*(in + 6) - base) 
>> (19 - 5); tmp |= (*(in + 7) - base) << 5; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (19 - 11); + tmp = (*(in + 8) - base) >> (19 - 11); tmp |= (*(in + 9) - base) << 11; tmp |= (*(in + 10) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (19 - 17); + tmp = (*(in + 10) - base) >> (19 - 17); tmp |= (*(in + 11) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (19 - 4); + tmp = (*(in + 11) - base) >> (19 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 13) - base) >> (19 - 10); + tmp = (*(in + 13) - base) >> (19 - 10); tmp |= (*(in + 14) - base) << 10; tmp |= (*(in + 15) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 15) - base) >> (19 - 16); + tmp = (*(in + 15) - base) >> (19 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 16) - base) >> (19 - 3); + tmp = (*(in + 16) - base) >> (19 - 3); tmp |= (*(in + 17) - base) << 3; tmp |= (*(in + 18) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 18) - base) >> (19 - 9); + tmp = (*(in + 18) - base) >> (19 - 9); tmp |= (*(in + 19) - base) << 9; tmp |= (*(in + 20) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 20) - base) >> (19 - 15); + tmp = (*(in + 20) - base) >> (19 - 15); tmp |= (*(in + 21) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 21) - base) >> (19 - 2); + tmp = (*(in 
+ 21) - base) >> (19 - 2); tmp |= (*(in + 22) - base) << 2; tmp |= (*(in + 23) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 23) - base) >> (19 - 8); + tmp = (*(in + 23) - base) >> (19 - 8); tmp |= (*(in + 24) - base) << 8; tmp |= (*(in + 25) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 25) - base) >> (19 - 14); + tmp = (*(in + 25) - base) >> (19 - 14); tmp |= (*(in + 26) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 26) - base) >> (19 - 1); + tmp = (*(in + 26) - base) >> (19 - 1); tmp |= (*(in + 27) - base) << 1; tmp |= (*(in + 28) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 28) - base) >> (19 - 7); + tmp = (*(in + 28) - base) >> (19 - 7); tmp |= (*(in + 29) - base) << 7; tmp |= (*(in + 30) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 30) - base) >> (19 - 13); + tmp = (*(in + 30) - base) >> (19 - 13); tmp |= (*(in + 31) - base) << 13; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -2792,23 +2768,23 @@ pack19_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 76; } -static uint32_t -unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 524287); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 6)) << (19 - 6); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); + *(out + 2) = base + ((*in32 >> 6) & 524287); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= 
(*in32 % (1U << 12)) << (19 - 12); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); + *(out + 4) = base + ((*in32 >> 12) & 524287); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 16) */ @@ -2819,13 +2795,13 @@ unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 5)) << (19 - 5); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); + *(out + 7) = base + ((*in32 >> 5) & 524287); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 11)) << (19 - 11); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 11) & 524287); + *(out + 9) = base + ((*in32 >> 11) & 524287); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 28) */ @@ -2836,13 +2812,13 @@ unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 4)) << (19 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 524287); + *(out + 12) = base + ((*in32 >> 4) & 524287); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 10)) << (19 - 10); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 10) & 524287); + *(out + 14) = base + ((*in32 >> 10) & 524287); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 40) */ @@ -2853,13 +2829,13 @@ unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 3)) << (19 - 3); *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 3) & 524287); + *(out + 17) = base + ((*in32 >> 3) & 524287); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 9)) << (19 - 9); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 9) & 524287); + *(out + 19) = base + ((*in32 >> 9) & 524287); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 52) */ @@ -2870,13 +2846,13 @@ unpack19_32(uint32_t 
base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 2)) << (19 - 2); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 2) & 524287); + *(out + 22) = base + ((*in32 >> 2) & 524287); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 8)) << (19 - 8); *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 524287); + *(out + 24) = base + ((*in32 >> 8) & 524287); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 64) */ @@ -2887,130 +2863,129 @@ unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 68) */ tmp |= (*in32 % (1U << 1)) << (19 - 1); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 1) & 524287); + *(out + 27) = base + ((*in32 >> 1) & 524287); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 7)) << (19 - 7); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 7) & 524287); + *(out + 29) = base + ((*in32 >> 7) & 524287); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 76) */ tmp |= (*in32 % (1U << 13)) << (19 - 13); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 13) & 524287); + *(out + 31) = base + ((*in32 >> 13) & 524287); /* remaining: 0 bits */ return 76; } -static uint32_t -pack20_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack20_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); + tmp = (*(in + 1) - base) >> (20 - 8); tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); + tmp = (*(in + 3) - base) >> (20 - 16); tmp |= (*(in 
+ 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); + tmp = (*(in + 4) - base) >> (20 - 4); tmp |= (*(in + 5) - base) << 4; tmp |= (*(in + 6) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); + tmp = (*(in + 6) - base) >> (20 - 12); tmp |= (*(in + 7) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (20 - 8); + tmp = (*(in + 9) - base) >> (20 - 8); tmp |= (*(in + 10) - base) << 8; tmp |= (*(in + 11) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (20 - 16); + tmp = (*(in + 11) - base) >> (20 - 16); tmp |= (*(in + 12) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (20 - 4); + tmp = (*(in + 12) - base) >> (20 - 4); tmp |= (*(in + 13) - base) << 4; tmp |= (*(in + 14) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 14) - base) >> (20 - 12); + tmp = (*(in + 14) - base) >> (20 - 12); tmp |= (*(in + 15) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 17) - base) >> (20 - 8); + tmp = (*(in + 17) - base) >> (20 - 8); tmp |= (*(in + 18) - base) << 8; tmp |= (*(in + 19) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes 
(total: 48) */ - tmp = (*(in + 19) - base) >> (20 - 16); + tmp = (*(in + 19) - base) >> (20 - 16); tmp |= (*(in + 20) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 20) - base) >> (20 - 4); + tmp = (*(in + 20) - base) >> (20 - 4); tmp |= (*(in + 21) - base) << 4; tmp |= (*(in + 22) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 22) - base) >> (20 - 12); + tmp = (*(in + 22) - base) >> (20 - 12); tmp |= (*(in + 23) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 25) - base) >> (20 - 8); + tmp = (*(in + 25) - base) >> (20 - 8); tmp |= (*(in + 26) - base) << 8; tmp |= (*(in + 27) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 27) - base) >> (20 - 16); + tmp = (*(in + 27) - base) >> (20 - 16); tmp |= (*(in + 28) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 28) - base) >> (20 - 4); + tmp = (*(in + 28) - base) >> (20 - 4); tmp |= (*(in + 29) - base) << 4; tmp |= (*(in + 30) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 30) - base) >> (20 - 12); + tmp = (*(in + 30) - base) >> (20 - 12); tmp |= (*(in + 31) - base) << 12; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -3019,17 +2994,17 @@ pack20_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 80; } -static uint32_t -unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; 
(void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 8) & 1048575); + *(out + 2) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ @@ -3040,22 +3015,22 @@ unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); + *(out + 5) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); + *(out + 7) = base + ((*in32 >> 12) & 1048575); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 8) = base + ((*in32 >> 0) & 1048575); + *(out + 8) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 8) & 1048575); + *(out + 10) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ @@ -3066,22 +3041,22 @@ unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 4) & 1048575); + *(out + 13) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 12) & 1048575); + *(out + 15) = base + ((*in32 >> 12) & 1048575); in32++; /* consumed: 4 bytes (total: 44) */ - *(out + 16) = base + ((*in32 >> 0) & 1048575); + *(out + 16) 
= base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 8) & 1048575); + *(out + 18) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 52) */ @@ -3092,22 +3067,22 @@ unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 4) & 1048575); + *(out + 21) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 12) & 1048575); + *(out + 23) = base + ((*in32 >> 12) & 1048575); in32++; /* consumed: 4 bytes (total: 64) */ - *(out + 24) = base + ((*in32 >> 0) & 1048575); + *(out + 24) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 68) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 8) & 1048575); + *(out + 26) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 72) */ @@ -3118,131 +3093,130 @@ unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 76) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 4) & 1048575); + *(out + 29) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 80) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 12) & 1048575); + *(out + 31) = base + ((*in32 >> 12) & 1048575); /* remaining: 0 bits */ return 80; } -static uint32_t -pack21_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack21_32(uint32_t base, const uint32_t *in, uint8_t *out) 
{ uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); + tmp = (*(in + 1) - base) >> (21 - 10); tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); + tmp = (*(in + 3) - base) >> (21 - 20); tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); + tmp = (*(in + 4) - base) >> (21 - 9); tmp |= (*(in + 5) - base) << 9; tmp |= (*(in + 6) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); + tmp = (*(in + 6) - base) >> (21 - 19); tmp |= (*(in + 7) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); + tmp = (*(in + 7) - base) >> (21 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (21 - 18); + tmp = (*(in + 9) - base) >> (21 - 18); tmp |= (*(in + 10) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (21 - 7); + tmp = (*(in + 10) - base) >> (21 - 7); tmp |= (*(in + 11) - base) << 7; tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (21 - 17); + tmp = (*(in + 12) - base) >> (21 - 17); tmp |= (*(in + 13) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (21 - 6); + tmp = (*(in + 13) - 
base) >> (21 - 6); tmp |= (*(in + 14) - base) << 6; tmp |= (*(in + 15) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 15) - base) >> (21 - 16); + tmp = (*(in + 15) - base) >> (21 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 16) - base) >> (21 - 5); + tmp = (*(in + 16) - base) >> (21 - 5); tmp |= (*(in + 17) - base) << 5; tmp |= (*(in + 18) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 18) - base) >> (21 - 15); + tmp = (*(in + 18) - base) >> (21 - 15); tmp |= (*(in + 19) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 19) - base) >> (21 - 4); + tmp = (*(in + 19) - base) >> (21 - 4); tmp |= (*(in + 20) - base) << 4; tmp |= (*(in + 21) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 21) - base) >> (21 - 14); + tmp = (*(in + 21) - base) >> (21 - 14); tmp |= (*(in + 22) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 22) - base) >> (21 - 3); + tmp = (*(in + 22) - base) >> (21 - 3); tmp |= (*(in + 23) - base) << 3; tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 24) - base) >> (21 - 13); + tmp = (*(in + 24) - base) >> (21 - 13); tmp |= (*(in + 25) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 25) - base) >> (21 - 2); + tmp = (*(in + 25) - base) >> (21 - 2); tmp |= (*(in + 26) - base) << 2; tmp |= (*(in + 27) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 27) - base) >> (21 - 12); + tmp = (*(in + 27) - base) >> (21 - 
12); tmp |= (*(in + 28) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 28) - base) >> (21 - 1); + tmp = (*(in + 28) - base) >> (21 - 1); tmp |= (*(in + 29) - base) << 1; tmp |= (*(in + 30) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 30) - base) >> (21 - 11); + tmp = (*(in + 30) - base) >> (21 - 11); tmp |= (*(in + 31) - base) << 11; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -3251,17 +3225,17 @@ pack21_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 84; } -static uint32_t -unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2097151); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (21 - 10); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 2097151); + *(out + 2) = base + ((*in32 >> 10) & 2097151); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ @@ -3272,7 +3246,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (21 - 9); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); + *(out + 5) = base + ((*in32 >> 9) & 2097151); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ @@ -3283,7 +3257,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (21 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 2097151); + *(out + 8) = base + ((*in32 >> 8) & 2097151); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ @@ -3294,7 +3268,7 @@ 
unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 7)) << (21 - 7); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 7) & 2097151); + *(out + 11) = base + ((*in32 >> 7) & 2097151); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ @@ -3305,7 +3279,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 6)) << (21 - 6); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 6) & 2097151); + *(out + 14) = base + ((*in32 >> 6) & 2097151); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 44) */ @@ -3316,7 +3290,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 5)) << (21 - 5); *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 5) & 2097151); + *(out + 17) = base + ((*in32 >> 5) & 2097151); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 52) */ @@ -3327,7 +3301,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (21 - 4); *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 2097151); + *(out + 20) = base + ((*in32 >> 4) & 2097151); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 60) */ @@ -3338,7 +3312,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 64) */ tmp |= (*in32 % (1U << 3)) << (21 - 3); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 3) & 2097151); + *(out + 23) = base + ((*in32 >> 3) & 2097151); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 68) */ @@ -3349,7 +3323,7 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 2)) << (21 - 2); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 2) & 2097151); + *(out + 26) = base + ((*in32 >> 
2) & 2097151); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 76) */ @@ -3360,134 +3334,133 @@ unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 80) */ tmp |= (*in32 % (1U << 1)) << (21 - 1); *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 1) & 2097151); + *(out + 29) = base + ((*in32 >> 1) & 2097151); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 84) */ tmp |= (*in32 % (1U << 11)) << (21 - 11); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 11) & 2097151); + *(out + 31) = base + ((*in32 >> 11) & 2097151); /* remaining: 0 bits */ return 84; } -static uint32_t -pack22_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack22_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); + tmp = (*(in + 1) - base) >> (22 - 12); tmp |= (*(in + 2) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); + tmp = (*(in + 2) - base) >> (22 - 2); tmp |= (*(in + 3) - base) << 2; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); + tmp = (*(in + 4) - base) >> (22 - 14); tmp |= (*(in + 5) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); + tmp = (*(in + 5) - base) >> (22 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); + tmp = (*(in + 7) - base) >> (22 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; 
out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (22 - 6); + tmp = (*(in + 8) - base) >> (22 - 6); tmp |= (*(in + 9) - base) << 6; tmp |= (*(in + 10) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (22 - 18); + tmp = (*(in + 10) - base) >> (22 - 18); tmp |= (*(in + 11) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (22 - 8); + tmp = (*(in + 11) - base) >> (22 - 8); tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (22 - 20); + tmp = (*(in + 13) - base) >> (22 - 20); tmp |= (*(in + 14) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 14) - base) >> (22 - 10); + tmp = (*(in + 14) - base) >> (22 - 10); tmp |= (*(in + 15) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 17) - base) >> (22 - 12); + tmp = (*(in + 17) - base) >> (22 - 12); tmp |= (*(in + 18) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 18) - base) >> (22 - 2); + tmp = (*(in + 18) - base) >> (22 - 2); tmp |= (*(in + 19) - base) << 2; tmp |= (*(in + 20) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 20) - base) >> (22 - 14); + tmp = (*(in + 20) - base) >> (22 - 14); tmp |= (*(in + 21) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 21) - base) >> (22 - 
4); + tmp = (*(in + 21) - base) >> (22 - 4); tmp |= (*(in + 22) - base) << 4; tmp |= (*(in + 23) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 23) - base) >> (22 - 16); + tmp = (*(in + 23) - base) >> (22 - 16); tmp |= (*(in + 24) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 24) - base) >> (22 - 6); + tmp = (*(in + 24) - base) >> (22 - 6); tmp |= (*(in + 25) - base) << 6; tmp |= (*(in + 26) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 26) - base) >> (22 - 18); + tmp = (*(in + 26) - base) >> (22 - 18); tmp |= (*(in + 27) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 27) - base) >> (22 - 8); + tmp = (*(in + 27) - base) >> (22 - 8); tmp |= (*(in + 28) - base) << 8; tmp |= (*(in + 29) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 29) - base) >> (22 - 20); + tmp = (*(in + 29) - base) >> (22 - 20); tmp |= (*(in + 30) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 30) - base) >> (22 - 10); + tmp = (*(in + 30) - base) >> (22 - 10); tmp |= (*(in + 31) - base) << 10; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -3496,11 +3469,11 @@ pack22_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 88; } -static uint32_t -unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4194303); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ @@ -3511,7 +3484,7 @@ 
unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (22 - 2); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); + *(out + 3) = base + ((*in32 >> 2) & 4194303); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 16) */ @@ -3522,7 +3495,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (22 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); + *(out + 6) = base + ((*in32 >> 4) & 4194303); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ @@ -3533,7 +3506,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (22 - 6); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 6) & 4194303); + *(out + 9) = base + ((*in32 >> 6) & 4194303); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ @@ -3544,7 +3517,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (22 - 8); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 4194303); + *(out + 12) = base + ((*in32 >> 8) & 4194303); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 40) */ @@ -3555,10 +3528,10 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 10)) << (22 - 10); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 10) & 4194303); + *(out + 15) = base + ((*in32 >> 10) & 4194303); in32++; /* consumed: 4 bytes (total: 48) */ - *(out + 16) = base + ((*in32 >> 0) & 4194303); + *(out + 16) = base + ((*in32 >> 0) & 4194303); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 52) */ @@ -3569,7 +3542,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= 
(*in32 % (1U << 2)) << (22 - 2); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 2) & 4194303); + *(out + 19) = base + ((*in32 >> 2) & 4194303); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 60) */ @@ -3580,7 +3553,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 64) */ tmp |= (*in32 % (1U << 4)) << (22 - 4); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 4) & 4194303); + *(out + 22) = base + ((*in32 >> 4) & 4194303); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 68) */ @@ -3591,7 +3564,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 6)) << (22 - 6); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 6) & 4194303); + *(out + 25) = base + ((*in32 >> 6) & 4194303); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 76) */ @@ -3602,7 +3575,7 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 80) */ tmp |= (*in32 % (1U << 8)) << (22 - 8); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 8) & 4194303); + *(out + 28) = base + ((*in32 >> 8) & 4194303); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 84) */ @@ -3613,133 +3586,132 @@ unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 88) */ tmp |= (*in32 % (1U << 10)) << (22 - 10); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 10) & 4194303); + *(out + 31) = base + ((*in32 >> 10) & 4194303); /* remaining: 0 bits */ return 88; } -static uint32_t -pack23_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack23_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 
14); + tmp = (*(in + 1) - base) >> (23 - 14); tmp |= (*(in + 2) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); + tmp = (*(in + 2) - base) >> (23 - 5); tmp |= (*(in + 3) - base) << 5; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); + tmp = (*(in + 4) - base) >> (23 - 19); tmp |= (*(in + 5) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); + tmp = (*(in + 5) - base) >> (23 - 10); tmp |= (*(in + 6) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (23 - 1); + tmp = (*(in + 6) - base) >> (23 - 1); tmp |= (*(in + 7) - base) << 1; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (23 - 15); + tmp = (*(in + 8) - base) >> (23 - 15); tmp |= (*(in + 9) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (23 - 6); + tmp = (*(in + 9) - base) >> (23 - 6); tmp |= (*(in + 10) - base) << 6; tmp |= (*(in + 11) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (23 - 20); + tmp = (*(in + 11) - base) >> (23 - 20); tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) >> (23 - 11); + tmp = (*(in + 12) - base) >> (23 - 11); tmp |= (*(in + 13) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (23 - 2); + tmp = (*(in + 13) - base) >> (23 - 2); tmp |= (*(in + 14) - base) << 2; tmp |= (*(in + 15) - 
base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 15) - base) >> (23 - 16); + tmp = (*(in + 15) - base) >> (23 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 16) - base) >> (23 - 7); + tmp = (*(in + 16) - base) >> (23 - 7); tmp |= (*(in + 17) - base) << 7; tmp |= (*(in + 18) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 18) - base) >> (23 - 21); + tmp = (*(in + 18) - base) >> (23 - 21); tmp |= (*(in + 19) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 19) - base) >> (23 - 12); + tmp = (*(in + 19) - base) >> (23 - 12); tmp |= (*(in + 20) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 20) - base) >> (23 - 3); + tmp = (*(in + 20) - base) >> (23 - 3); tmp |= (*(in + 21) - base) << 3; tmp |= (*(in + 22) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 22) - base) >> (23 - 17); + tmp = (*(in + 22) - base) >> (23 - 17); tmp |= (*(in + 23) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 23) - base) >> (23 - 8); + tmp = (*(in + 23) - base) >> (23 - 8); tmp |= (*(in + 24) - base) << 8; tmp |= (*(in + 25) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 25) - base) >> (23 - 22); + tmp = (*(in + 25) - base) >> (23 - 22); tmp |= (*(in + 26) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 26) - base) >> (23 - 13); + tmp = (*(in + 26) - base) >> (23 - 13); tmp |= (*(in + 27) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 80) */ - tmp = (*(in + 27) - base) >> (23 - 4); + tmp = (*(in + 27) - base) >> (23 - 4); tmp |= (*(in + 28) - base) << 4; tmp |= (*(in + 29) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 29) - base) >> (23 - 18); + tmp = (*(in + 29) - base) >> (23 - 18); tmp |= (*(in + 30) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 30) - base) >> (23 - 9); + tmp = (*(in + 30) - base) >> (23 - 9); tmp |= (*(in + 31) - base) << 9; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -3748,11 +3720,11 @@ pack23_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 92; } -static uint32_t -unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8388607); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 8) */ @@ -3763,7 +3735,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 5)) << (23 - 5); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); + *(out + 3) = base + ((*in32 >> 5) & 8388607); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 16) */ @@ -3779,7 +3751,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 1)) << (23 - 1); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 8388607); + *(out + 7) = base + ((*in32 >> 1) & 8388607); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 28) */ @@ -3790,7 +3762,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 6)) << (23 - 6); 
*(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 8388607); + *(out + 10) = base + ((*in32 >> 6) & 8388607); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 36) */ @@ -3806,7 +3778,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 2)) << (23 - 2); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 8388607); + *(out + 14) = base + ((*in32 >> 2) & 8388607); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 48) */ @@ -3817,7 +3789,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 7)) << (23 - 7); *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 7) & 8388607); + *(out + 17) = base + ((*in32 >> 7) & 8388607); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 56) */ @@ -3833,7 +3805,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 64) */ tmp |= (*in32 % (1U << 3)) << (23 - 3); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 3) & 8388607); + *(out + 21) = base + ((*in32 >> 3) & 8388607); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 68) */ @@ -3844,7 +3816,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 8)) << (23 - 8); *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 8388607); + *(out + 24) = base + ((*in32 >> 8) & 8388607); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 76) */ @@ -3860,7 +3832,7 @@ unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 84) */ tmp |= (*in32 % (1U << 4)) << (23 - 4); *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 8388607); + *(out + 28) = base + ((*in32 >> 4) & 8388607); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 88) */ @@ -3871,130 +3843,129 @@ unpack23_32(uint32_t base, const 
uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 92) */ tmp |= (*in32 % (1U << 9)) << (23 - 9); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 9) & 8388607); + *(out + 31) = base + ((*in32 >> 9) & 8388607); /* remaining: 0 bits */ return 92; } -static uint32_t -pack24_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack24_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); + tmp = (*(in + 1) - base) >> (24 - 16); tmp |= (*(in + 2) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); + tmp = (*(in + 2) - base) >> (24 - 8); tmp |= (*(in + 3) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); + tmp = (*(in + 5) - base) >> (24 - 16); tmp |= (*(in + 6) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); + tmp = (*(in + 6) - base) >> (24 - 8); tmp |= (*(in + 7) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (24 - 16); + tmp = (*(in + 9) - base) >> (24 - 16); tmp |= (*(in + 10) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = 
(*(in + 10) - base) >> (24 - 8); + tmp = (*(in + 10) - base) >> (24 - 8); tmp |= (*(in + 11) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 13) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (24 - 16); + tmp = (*(in + 13) - base) >> (24 - 16); tmp |= (*(in + 14) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (24 - 8); + tmp = (*(in + 14) - base) >> (24 - 8); tmp |= (*(in + 15) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 17) - base) >> (24 - 16); + tmp = (*(in + 17) - base) >> (24 - 16); tmp |= (*(in + 18) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 18) - base) >> (24 - 8); + tmp = (*(in + 18) - base) >> (24 - 8); tmp |= (*(in + 19) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 20) - base) << 0; + tmp = (*(in + 20) - base) << 0; tmp |= (*(in + 21) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 21) - base) >> (24 - 16); + tmp = (*(in + 21) - base) >> (24 - 16); tmp |= (*(in + 22) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 22) - base) >> (24 - 8); + tmp = (*(in + 22) - base) >> (24 - 8); tmp |= (*(in + 23) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 24) - base) << 0; + 
tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 25) - base) >> (24 - 16); + tmp = (*(in + 25) - base) >> (24 - 16); tmp |= (*(in + 26) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 26) - base) >> (24 - 8); + tmp = (*(in + 26) - base) >> (24 - 8); tmp |= (*(in + 27) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 28) - base) << 0; + tmp = (*(in + 28) - base) << 0; tmp |= (*(in + 29) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 29) - base) >> (24 - 16); + tmp = (*(in + 29) - base) >> (24 - 16); tmp |= (*(in + 30) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 30) - base) >> (24 - 8); + tmp = (*(in + 30) - base) >> (24 - 8); tmp |= (*(in + 31) - base) << 8; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -4003,11 +3974,11 @@ pack24_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 96; } -static uint32_t -unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ @@ -4018,10 +3989,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); + *(out + 3) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 
16777215); + *(out + 4) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ @@ -4032,10 +4003,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); + *(out + 7) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 8) = base + ((*in32 >> 0) & 16777215); + *(out + 8) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ @@ -4046,10 +4017,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 8) & 16777215); + *(out + 11) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 40) */ - *(out + 12) = base + ((*in32 >> 0) & 16777215); + *(out + 12) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 44) */ @@ -4060,10 +4031,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 8) & 16777215); + *(out + 15) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 52) */ - *(out + 16) = base + ((*in32 >> 0) & 16777215); + *(out + 16) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 56) */ @@ -4074,10 +4045,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 8) & 16777215); + *(out + 19) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 64) */ - *(out + 20) = base + ((*in32 >> 0) & 16777215); + 
*(out + 20) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 68) */ @@ -4088,10 +4059,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 8) & 16777215); + *(out + 23) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 76) */ - *(out + 24) = base + ((*in32 >> 0) & 16777215); + *(out + 24) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 80) */ @@ -4102,10 +4073,10 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 84) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 8) & 16777215); + *(out + 27) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 88) */ - *(out + 28) = base + ((*in32 >> 0) & 16777215); + *(out + 28) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 92) */ @@ -4116,141 +4087,140 @@ unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 96) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 8) & 16777215); + *(out + 31) = base + ((*in32 >> 8) & 16777215); /* remaining: 0 bits */ return 96; } -static uint32_t -pack25_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack25_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); + tmp = (*(in + 1) - base) >> (25 - 18); tmp |= (*(in + 2) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - 
tmp = (*(in + 2) - base) >> (25 - 11); + tmp = (*(in + 2) - base) >> (25 - 11); tmp |= (*(in + 3) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); + tmp = (*(in + 3) - base) >> (25 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); + tmp = (*(in + 5) - base) >> (25 - 22); tmp |= (*(in + 6) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (25 - 15); + tmp = (*(in + 6) - base) >> (25 - 15); tmp |= (*(in + 7) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (25 - 8); + tmp = (*(in + 7) - base) >> (25 - 8); tmp |= (*(in + 8) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (25 - 1); + tmp = (*(in + 8) - base) >> (25 - 1); tmp |= (*(in + 9) - base) << 1; tmp |= (*(in + 10) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (25 - 19); + tmp = (*(in + 10) - base) >> (25 - 19); tmp |= (*(in + 11) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (25 - 12); + tmp = (*(in + 11) - base) >> (25 - 12); tmp |= (*(in + 12) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (25 - 5); + tmp = (*(in + 12) - base) >> (25 - 5); tmp |= (*(in + 13) - base) << 5; tmp |= (*(in + 14) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (25 - 23); + tmp = (*(in + 14) - base) >> (25 - 23); tmp |= (*(in + 15) 
- base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 15) - base) >> (25 - 16); + tmp = (*(in + 15) - base) >> (25 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 16) - base) >> (25 - 9); + tmp = (*(in + 16) - base) >> (25 - 9); tmp |= (*(in + 17) - base) << 9; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 17) - base) >> (25 - 2); + tmp = (*(in + 17) - base) >> (25 - 2); tmp |= (*(in + 18) - base) << 2; tmp |= (*(in + 19) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 19) - base) >> (25 - 20); + tmp = (*(in + 19) - base) >> (25 - 20); tmp |= (*(in + 20) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 20) - base) >> (25 - 13); + tmp = (*(in + 20) - base) >> (25 - 13); tmp |= (*(in + 21) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 21) - base) >> (25 - 6); + tmp = (*(in + 21) - base) >> (25 - 6); tmp |= (*(in + 22) - base) << 6; tmp |= (*(in + 23) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 23) - base) >> (25 - 24); + tmp = (*(in + 23) - base) >> (25 - 24); tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 24) - base) >> (25 - 17); + tmp = (*(in + 24) - base) >> (25 - 17); tmp |= (*(in + 25) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 25) - base) >> (25 - 10); + tmp = (*(in + 25) - base) >> (25 - 10); tmp |= (*(in + 26) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in 
+ 26) - base) >> (25 - 3); + tmp = (*(in + 26) - base) >> (25 - 3); tmp |= (*(in + 27) - base) << 3; tmp |= (*(in + 28) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 28) - base) >> (25 - 21); + tmp = (*(in + 28) - base) >> (25 - 21); tmp |= (*(in + 29) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 29) - base) >> (25 - 14); + tmp = (*(in + 29) - base) >> (25 - 14); tmp |= (*(in + 30) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 30) - base) >> (25 - 7); + tmp = (*(in + 30) - base) >> (25 - 7); tmp |= (*(in + 31) - base) << 7; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -4259,11 +4229,11 @@ pack25_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 100; } -static uint32_t -unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 33554431); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 33554431); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 8) */ @@ -4279,7 +4249,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (25 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); + *(out + 4) = base + ((*in32 >> 4) & 33554431); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 20) */ @@ -4300,7 +4270,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 1)) << (25 - 1); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 1) & 33554431); + *(out + 9) = base + ((*in32 >> 1) & 33554431); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes 
(total: 36) */ @@ -4316,7 +4286,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 5)) << (25 - 5); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 5) & 33554431); + *(out + 13) = base + ((*in32 >> 5) & 33554431); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 48) */ @@ -4337,7 +4307,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 2)) << (25 - 2); *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 2) & 33554431); + *(out + 18) = base + ((*in32 >> 2) & 33554431); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 64) */ @@ -4353,7 +4323,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 6)) << (25 - 6); *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 6) & 33554431); + *(out + 22) = base + ((*in32 >> 6) & 33554431); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 76) */ @@ -4374,7 +4344,7 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 88) */ tmp |= (*in32 % (1U << 3)) << (25 - 3); *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 3) & 33554431); + *(out + 27) = base + ((*in32 >> 3) & 33554431); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 92) */ @@ -4390,144 +4360,143 @@ unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 100) */ tmp |= (*in32 % (1U << 7)) << (25 - 7); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 7) & 33554431); + *(out + 31) = base + ((*in32 >> 7) & 33554431); /* remaining: 0 bits */ return 100; } -static uint32_t -pack26_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack26_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; 
tmp |= (*(in + 1) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); + tmp = (*(in + 1) - base) >> (26 - 20); tmp |= (*(in + 2) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); + tmp = (*(in + 2) - base) >> (26 - 14); tmp |= (*(in + 3) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); + tmp = (*(in + 3) - base) >> (26 - 8); tmp |= (*(in + 4) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); + tmp = (*(in + 4) - base) >> (26 - 2); tmp |= (*(in + 5) - base) << 2; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 22); + tmp = (*(in + 6) - base) >> (26 - 22); tmp |= (*(in + 7) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); + tmp = (*(in + 7) - base) >> (26 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (26 - 10); + tmp = (*(in + 8) - base) >> (26 - 10); tmp |= (*(in + 9) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (26 - 4); + tmp = (*(in + 9) - base) >> (26 - 4); tmp |= (*(in + 10) - base) << 4; tmp |= (*(in + 11) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (26 - 24); + tmp = (*(in + 11) - base) >> (26 - 24); tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - 
base) >> (26 - 18); + tmp = (*(in + 12) - base) >> (26 - 18); tmp |= (*(in + 13) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (26 - 12); + tmp = (*(in + 13) - base) >> (26 - 12); tmp |= (*(in + 14) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (26 - 6); + tmp = (*(in + 14) - base) >> (26 - 6); tmp |= (*(in + 15) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 17) - base) >> (26 - 20); + tmp = (*(in + 17) - base) >> (26 - 20); tmp |= (*(in + 18) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 18) - base) >> (26 - 14); + tmp = (*(in + 18) - base) >> (26 - 14); tmp |= (*(in + 19) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 19) - base) >> (26 - 8); + tmp = (*(in + 19) - base) >> (26 - 8); tmp |= (*(in + 20) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 20) - base) >> (26 - 2); + tmp = (*(in + 20) - base) >> (26 - 2); tmp |= (*(in + 21) - base) << 2; tmp |= (*(in + 22) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 22) - base) >> (26 - 22); + tmp = (*(in + 22) - base) >> (26 - 22); tmp |= (*(in + 23) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 23) - base) >> (26 - 16); + tmp = (*(in + 23) - base) >> (26 - 16); tmp |= (*(in + 24) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 80) */ - tmp = (*(in + 24) - base) >> (26 - 10); + tmp = (*(in + 24) - base) >> (26 - 10); tmp |= (*(in + 25) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 25) - base) >> (26 - 4); + tmp = (*(in + 25) - base) >> (26 - 4); tmp |= (*(in + 26) - base) << 4; tmp |= (*(in + 27) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 27) - base) >> (26 - 24); + tmp = (*(in + 27) - base) >> (26 - 24); tmp |= (*(in + 28) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 28) - base) >> (26 - 18); + tmp = (*(in + 28) - base) >> (26 - 18); tmp |= (*(in + 29) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 29) - base) >> (26 - 12); + tmp = (*(in + 29) - base) >> (26 - 12); tmp |= (*(in + 30) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 30) - base) >> (26 - 6); + tmp = (*(in + 30) - base) >> (26 - 6); tmp |= (*(in + 31) - base) << 6; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -4536,11 +4505,11 @@ pack26_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 104; } -static uint32_t -unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 67108863); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ @@ -4561,7 +4530,7 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (26 - 2); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 2) & 67108863); + *(out + 
5) = base + ((*in32 >> 2) & 67108863); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ @@ -4582,7 +4551,7 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (26 - 4); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 67108863); + *(out + 10) = base + ((*in32 >> 4) & 67108863); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 40) */ @@ -4603,10 +4572,10 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 6)) << (26 - 6); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 6) & 67108863); + *(out + 15) = base + ((*in32 >> 6) & 67108863); in32++; /* consumed: 4 bytes (total: 56) */ - *(out + 16) = base + ((*in32 >> 0) & 67108863); + *(out + 16) = base + ((*in32 >> 0) & 67108863); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 60) */ @@ -4627,7 +4596,7 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 72) */ tmp |= (*in32 % (1U << 2)) << (26 - 2); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 2) & 67108863); + *(out + 21) = base + ((*in32 >> 2) & 67108863); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 76) */ @@ -4648,7 +4617,7 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 88) */ tmp |= (*in32 % (1U << 4)) << (26 - 4); *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 4) & 67108863); + *(out + 26) = base + ((*in32 >> 4) & 67108863); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 92) */ @@ -4669,149 +4638,148 @@ unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 104) */ tmp |= (*in32 % (1U << 6)) << (26 - 6); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 6) & 67108863); + *(out + 31) = base + ((*in32 >> 6) & 67108863); /* remaining: 0 bits */ return 104; } -static 
uint32_t -pack27_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack27_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); + tmp = (*(in + 1) - base) >> (27 - 22); tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); + tmp = (*(in + 2) - base) >> (27 - 17); tmp |= (*(in + 3) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); + tmp = (*(in + 3) - base) >> (27 - 12); tmp |= (*(in + 4) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); + tmp = (*(in + 4) - base) >> (27 - 7); tmp |= (*(in + 5) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); + tmp = (*(in + 5) - base) >> (27 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); + tmp = (*(in + 7) - base) >> (27 - 24); tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (27 - 19); + tmp = (*(in + 8) - base) >> (27 - 19); tmp |= (*(in + 9) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (27 - 14); + tmp = (*(in + 9) - base) >> (27 - 14); tmp |= (*(in + 10) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 
10) - base) >> (27 - 9); + tmp = (*(in + 10) - base) >> (27 - 9); tmp |= (*(in + 11) - base) << 9; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (27 - 4); + tmp = (*(in + 11) - base) >> (27 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (27 - 26); + tmp = (*(in + 13) - base) >> (27 - 26); tmp |= (*(in + 14) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (27 - 21); + tmp = (*(in + 14) - base) >> (27 - 21); tmp |= (*(in + 15) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 15) - base) >> (27 - 16); + tmp = (*(in + 15) - base) >> (27 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 16) - base) >> (27 - 11); + tmp = (*(in + 16) - base) >> (27 - 11); tmp |= (*(in + 17) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 17) - base) >> (27 - 6); + tmp = (*(in + 17) - base) >> (27 - 6); tmp |= (*(in + 18) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 18) - base) >> (27 - 1); + tmp = (*(in + 18) - base) >> (27 - 1); tmp |= (*(in + 19) - base) << 1; tmp |= (*(in + 20) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 20) - base) >> (27 - 23); + tmp = (*(in + 20) - base) >> (27 - 23); tmp |= (*(in + 21) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 21) - base) >> (27 - 18); + tmp = (*(in + 21) - base) >> (27 - 18); tmp |= (*(in + 22) - base) << 18; *(uint32_t 
*)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 22) - base) >> (27 - 13); + tmp = (*(in + 22) - base) >> (27 - 13); tmp |= (*(in + 23) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 23) - base) >> (27 - 8); + tmp = (*(in + 23) - base) >> (27 - 8); tmp |= (*(in + 24) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 24) - base) >> (27 - 3); + tmp = (*(in + 24) - base) >> (27 - 3); tmp |= (*(in + 25) - base) << 3; tmp |= (*(in + 26) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 26) - base) >> (27 - 25); + tmp = (*(in + 26) - base) >> (27 - 25); tmp |= (*(in + 27) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 27) - base) >> (27 - 20); + tmp = (*(in + 27) - base) >> (27 - 20); tmp |= (*(in + 28) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 28) - base) >> (27 - 15); + tmp = (*(in + 28) - base) >> (27 - 15); tmp |= (*(in + 29) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 29) - base) >> (27 - 10); + tmp = (*(in + 29) - base) >> (27 - 10); tmp |= (*(in + 30) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 30) - base) >> (27 - 5); + tmp = (*(in + 30) - base) >> (27 - 5); tmp |= (*(in + 31) - base) << 5; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -4820,11 +4788,11 @@ pack27_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 108; } -static uint32_t -unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - 
uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 134217727); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ @@ -4850,7 +4818,7 @@ unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (27 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); + *(out + 6) = base + ((*in32 >> 2) & 134217727); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ @@ -4876,7 +4844,7 @@ unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 4)) << (27 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 134217727); + *(out + 12) = base + ((*in32 >> 4) & 134217727); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 48) */ @@ -4907,7 +4875,7 @@ unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 68) */ tmp |= (*in32 % (1U << 1)) << (27 - 1); *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 1) & 134217727); + *(out + 19) = base + ((*in32 >> 1) & 134217727); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 72) */ @@ -4933,7 +4901,7 @@ unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 88) */ tmp |= (*in32 % (1U << 3)) << (27 - 3); *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 3) & 134217727); + *(out + 25) = base + ((*in32 >> 3) & 134217727); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 92) */ @@ -4959,150 +4927,149 @@ unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 108) */ tmp |= (*in32 % (1U << 5)) << (27 - 5); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 5) & 134217727); + *(out + 31) = base + ((*in32 >> 5) & 134217727); /* remaining: 0 bits */ return 108; } -static uint32_t -pack28_32(uint32_t 
base, const uint32_t *in, uint8_t *out) { +static uint32_t pack28_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); + tmp = (*(in + 1) - base) >> (28 - 24); tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (28 - 20); + tmp = (*(in + 2) - base) >> (28 - 20); tmp |= (*(in + 3) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); + tmp = (*(in + 3) - base) >> (28 - 16); tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); + tmp = (*(in + 4) - base) >> (28 - 12); tmp |= (*(in + 5) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); + tmp = (*(in + 5) - base) >> (28 - 8); tmp |= (*(in + 6) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); + tmp = (*(in + 6) - base) >> (28 - 4); tmp |= (*(in + 7) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (28 - 24); + tmp = (*(in + 9) - base) >> (28 - 24); tmp |= (*(in + 10) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (28 - 20); + tmp = (*(in + 10) - base) >> (28 - 20); tmp |= (*(in 
+ 11) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (28 - 16); + tmp = (*(in + 11) - base) >> (28 - 16); tmp |= (*(in + 12) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (28 - 12); + tmp = (*(in + 12) - base) >> (28 - 12); tmp |= (*(in + 13) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (28 - 8); + tmp = (*(in + 13) - base) >> (28 - 8); tmp |= (*(in + 14) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (28 - 4); + tmp = (*(in + 14) - base) >> (28 - 4); tmp |= (*(in + 15) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 17) - base) >> (28 - 24); + tmp = (*(in + 17) - base) >> (28 - 24); tmp |= (*(in + 18) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 18) - base) >> (28 - 20); + tmp = (*(in + 18) - base) >> (28 - 20); tmp |= (*(in + 19) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 19) - base) >> (28 - 16); + tmp = (*(in + 19) - base) >> (28 - 16); tmp |= (*(in + 20) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 20) - base) >> (28 - 12); + tmp = (*(in + 20) - base) >> (28 - 12); tmp |= (*(in + 21) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 21) - base) >> (28 - 8); + tmp = (*(in + 21) - base) >> (28 - 8); tmp |= 
(*(in + 22) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 22) - base) >> (28 - 4); + tmp = (*(in + 22) - base) >> (28 - 4); tmp |= (*(in + 23) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 24) - base) << 0; + tmp = (*(in + 24) - base) << 0; tmp |= (*(in + 25) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 25) - base) >> (28 - 24); + tmp = (*(in + 25) - base) >> (28 - 24); tmp |= (*(in + 26) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 26) - base) >> (28 - 20); + tmp = (*(in + 26) - base) >> (28 - 20); tmp |= (*(in + 27) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 27) - base) >> (28 - 16); + tmp = (*(in + 27) - base) >> (28 - 16); tmp |= (*(in + 28) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 28) - base) >> (28 - 12); + tmp = (*(in + 28) - base) >> (28 - 12); tmp |= (*(in + 29) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 29) - base) >> (28 - 8); + tmp = (*(in + 29) - base) >> (28 - 8); tmp |= (*(in + 30) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 30) - base) >> (28 - 4); + tmp = (*(in + 30) - base) >> (28 - 4); tmp |= (*(in + 31) - base) << 4; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -5111,11 +5078,11 @@ pack28_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 112; } -static uint32_t -unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; 
(void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ @@ -5146,10 +5113,10 @@ unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); + *(out + 7) = base + ((*in32 >> 4) & 268435455); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 8) = base + ((*in32 >> 0) & 268435455); + *(out + 8) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ @@ -5180,10 +5147,10 @@ unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 4) & 268435455); + *(out + 15) = base + ((*in32 >> 4) & 268435455); in32++; /* consumed: 4 bytes (total: 60) */ - *(out + 16) = base + ((*in32 >> 0) & 268435455); + *(out + 16) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 64) */ @@ -5214,10 +5181,10 @@ unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 84) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 4) & 268435455); + *(out + 23) = base + ((*in32 >> 4) & 268435455); in32++; /* consumed: 4 bytes (total: 88) */ - *(out + 24) = base + ((*in32 >> 0) & 268435455); + *(out + 24) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 92) */ @@ -5248,157 +5215,156 @@ unpack28_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 112) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 4) & 268435455); + *(out + 31) = base + ((*in32 >> 4) & 268435455); 
/* remaining: 0 bits */ return 112; } -static uint32_t -pack29_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack29_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); + tmp = (*(in + 1) - base) >> (29 - 26); tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); + tmp = (*(in + 2) - base) >> (29 - 23); tmp |= (*(in + 3) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); + tmp = (*(in + 3) - base) >> (29 - 20); tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); + tmp = (*(in + 4) - base) >> (29 - 17); tmp |= (*(in + 5) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); + tmp = (*(in + 5) - base) >> (29 - 14); tmp |= (*(in + 6) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); + tmp = (*(in + 6) - base) >> (29 - 11); tmp |= (*(in + 7) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); + tmp = (*(in + 7) - base) >> (29 - 8); tmp |= (*(in + 8) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (29 - 5); + tmp = (*(in + 8) - base) >> (29 - 5); tmp |= (*(in + 9) - base) << 5; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp 
= (*(in + 9) - base) >> (29 - 2); + tmp = (*(in + 9) - base) >> (29 - 2); tmp |= (*(in + 10) - base) << 2; tmp |= (*(in + 11) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (29 - 28); + tmp = (*(in + 11) - base) >> (29 - 28); tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (29 - 25); + tmp = (*(in + 12) - base) >> (29 - 25); tmp |= (*(in + 13) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (29 - 22); + tmp = (*(in + 13) - base) >> (29 - 22); tmp |= (*(in + 14) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (29 - 19); + tmp = (*(in + 14) - base) >> (29 - 19); tmp |= (*(in + 15) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 15) - base) >> (29 - 16); + tmp = (*(in + 15) - base) >> (29 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 16) - base) >> (29 - 13); + tmp = (*(in + 16) - base) >> (29 - 13); tmp |= (*(in + 17) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 17) - base) >> (29 - 10); + tmp = (*(in + 17) - base) >> (29 - 10); tmp |= (*(in + 18) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 18) - base) >> (29 - 7); + tmp = (*(in + 18) - base) >> (29 - 7); tmp |= (*(in + 19) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 19) - base) >> (29 - 4); + tmp = (*(in + 19) - base) >> (29 - 4); tmp |= (*(in + 20) - base) << 4; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 20) - base) >> (29 - 1); + tmp = (*(in + 20) - base) >> (29 - 1); tmp |= (*(in + 21) - base) << 1; tmp |= (*(in + 22) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 22) - base) >> (29 - 27); + tmp = (*(in + 22) - base) >> (29 - 27); tmp |= (*(in + 23) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 23) - base) >> (29 - 24); + tmp = (*(in + 23) - base) >> (29 - 24); tmp |= (*(in + 24) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 24) - base) >> (29 - 21); + tmp = (*(in + 24) - base) >> (29 - 21); tmp |= (*(in + 25) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 25) - base) >> (29 - 18); + tmp = (*(in + 25) - base) >> (29 - 18); tmp |= (*(in + 26) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 26) - base) >> (29 - 15); + tmp = (*(in + 26) - base) >> (29 - 15); tmp |= (*(in + 27) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 27) - base) >> (29 - 12); + tmp = (*(in + 27) - base) >> (29 - 12); tmp |= (*(in + 28) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 28) - base) >> (29 - 9); + tmp = (*(in + 28) - base) >> (29 - 9); tmp |= (*(in + 29) - base) << 9; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 29) - base) >> (29 - 6); + tmp = (*(in + 29) - base) >> (29 - 6); tmp |= (*(in + 30) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 112) */ - tmp = (*(in + 30) - base) >> (29 - 3); + tmp = (*(in + 30) - base) >> (29 - 3); tmp |= 
(*(in + 31) - base) << 3; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -5407,11 +5373,11 @@ pack29_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 116; } -static uint32_t -unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 536870911); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 8) */ @@ -5457,7 +5423,7 @@ unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 2)) << (29 - 2); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 536870911); + *(out + 10) = base + ((*in32 >> 2) & 536870911); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 44) */ @@ -5508,7 +5474,7 @@ unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 80) */ tmp |= (*in32 % (1U << 1)) << (29 - 1); *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 1) & 536870911); + *(out + 21) = base + ((*in32 >> 1) & 536870911); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 84) */ @@ -5554,160 +5520,159 @@ unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 116) */ tmp |= (*in32 % (1U << 3)) << (29 - 3); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 3) & 536870911); + *(out + 31) = base + ((*in32 >> 3) & 536870911); /* remaining: 0 bits */ return 116; } -static uint32_t -pack30_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack30_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ 
- tmp = (*(in + 1) - base) >> (30 - 28); + tmp = (*(in + 1) - base) >> (30 - 28); tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (30 - 26); + tmp = (*(in + 2) - base) >> (30 - 26); tmp |= (*(in + 3) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); + tmp = (*(in + 3) - base) >> (30 - 24); tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); + tmp = (*(in + 4) - base) >> (30 - 22); tmp |= (*(in + 5) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); + tmp = (*(in + 5) - base) >> (30 - 20); tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); + tmp = (*(in + 6) - base) >> (30 - 18); tmp |= (*(in + 7) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (30 - 16); + tmp = (*(in + 7) - base) >> (30 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (30 - 14); + tmp = (*(in + 8) - base) >> (30 - 14); tmp |= (*(in + 9) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (30 - 12); + tmp = (*(in + 9) - base) >> (30 - 12); tmp |= (*(in + 10) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (30 - 10); + tmp = (*(in + 10) - base) >> (30 - 10); tmp |= (*(in + 11) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) 
*/ - tmp = (*(in + 11) - base) >> (30 - 8); + tmp = (*(in + 11) - base) >> (30 - 8); tmp |= (*(in + 12) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (30 - 6); + tmp = (*(in + 12) - base) >> (30 - 6); tmp |= (*(in + 13) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (30 - 4); + tmp = (*(in + 13) - base) >> (30 - 4); tmp |= (*(in + 14) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (30 - 2); + tmp = (*(in + 14) - base) >> (30 - 2); tmp |= (*(in + 15) - base) << 2; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 16) - base) << 0; + tmp = (*(in + 16) - base) << 0; tmp |= (*(in + 17) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 17) - base) >> (30 - 28); + tmp = (*(in + 17) - base) >> (30 - 28); tmp |= (*(in + 18) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 18) - base) >> (30 - 26); + tmp = (*(in + 18) - base) >> (30 - 26); tmp |= (*(in + 19) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 19) - base) >> (30 - 24); + tmp = (*(in + 19) - base) >> (30 - 24); tmp |= (*(in + 20) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 20) - base) >> (30 - 22); + tmp = (*(in + 20) - base) >> (30 - 22); tmp |= (*(in + 21) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 21) - base) >> (30 - 20); + tmp = (*(in + 21) - base) >> (30 - 20); tmp |= (*(in + 22) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 
84) */ - tmp = (*(in + 22) - base) >> (30 - 18); + tmp = (*(in + 22) - base) >> (30 - 18); tmp |= (*(in + 23) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 23) - base) >> (30 - 16); + tmp = (*(in + 23) - base) >> (30 - 16); tmp |= (*(in + 24) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 24) - base) >> (30 - 14); + tmp = (*(in + 24) - base) >> (30 - 14); tmp |= (*(in + 25) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 25) - base) >> (30 - 12); + tmp = (*(in + 25) - base) >> (30 - 12); tmp |= (*(in + 26) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 26) - base) >> (30 - 10); + tmp = (*(in + 26) - base) >> (30 - 10); tmp |= (*(in + 27) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 27) - base) >> (30 - 8); + tmp = (*(in + 27) - base) >> (30 - 8); tmp |= (*(in + 28) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 28) - base) >> (30 - 6); + tmp = (*(in + 28) - base) >> (30 - 6); tmp |= (*(in + 29) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 112) */ - tmp = (*(in + 29) - base) >> (30 - 4); + tmp = (*(in + 29) - base) >> (30 - 4); tmp |= (*(in + 30) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 116) */ - tmp = (*(in + 30) - base) >> (30 - 2); + tmp = (*(in + 30) - base) >> (30 - 2); tmp |= (*(in + 31) - base) << 2; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -5716,11 +5681,11 @@ pack30_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 120; } -static uint32_t -unpack30_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t 
unpack30_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1073741823); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ @@ -5791,10 +5756,10 @@ unpack30_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 2)) << (30 - 2); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 2) & 1073741823); + *(out + 15) = base + ((*in32 >> 2) & 1073741823); in32++; /* consumed: 4 bytes (total: 64) */ - *(out + 16) = base + ((*in32 >> 0) & 1073741823); + *(out + 16) = base + ((*in32 >> 0) & 1073741823); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 68) */ @@ -5865,165 +5830,164 @@ unpack30_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 120) */ tmp |= (*in32 % (1U << 2)) << (30 - 2); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 2) & 1073741823); + *(out + 31) = base + ((*in32 >> 2) & 1073741823); /* remaining: 0 bits */ return 120; } -static uint32_t -pack31_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack31_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); + tmp = (*(in + 1) - base) >> (31 - 30); tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); + tmp = (*(in + 2) - base) >> (31 - 29); tmp |= (*(in + 3) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); + tmp = (*(in + 3) - base) >> (31 - 
28); tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); + tmp = (*(in + 4) - base) >> (31 - 27); tmp |= (*(in + 5) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); + tmp = (*(in + 5) - base) >> (31 - 26); tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); + tmp = (*(in + 6) - base) >> (31 - 25); tmp |= (*(in + 7) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (31 - 24); + tmp = (*(in + 7) - base) >> (31 - 24); tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (31 - 23); + tmp = (*(in + 8) - base) >> (31 - 23); tmp |= (*(in + 9) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (31 - 22); + tmp = (*(in + 9) - base) >> (31 - 22); tmp |= (*(in + 10) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (31 - 21); + tmp = (*(in + 10) - base) >> (31 - 21); tmp |= (*(in + 11) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (31 - 20); + tmp = (*(in + 11) - base) >> (31 - 20); tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (31 - 19); + tmp = (*(in + 12) - base) >> (31 - 19); tmp |= (*(in + 13) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (31 - 18); + tmp = (*(in + 13) - 
base) >> (31 - 18); tmp |= (*(in + 14) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (31 - 17); + tmp = (*(in + 14) - base) >> (31 - 17); tmp |= (*(in + 15) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 15) - base) >> (31 - 16); + tmp = (*(in + 15) - base) >> (31 - 16); tmp |= (*(in + 16) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 16) - base) >> (31 - 15); + tmp = (*(in + 16) - base) >> (31 - 15); tmp |= (*(in + 17) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 17) - base) >> (31 - 14); + tmp = (*(in + 17) - base) >> (31 - 14); tmp |= (*(in + 18) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 18) - base) >> (31 - 13); + tmp = (*(in + 18) - base) >> (31 - 13); tmp |= (*(in + 19) - base) << 13; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 19) - base) >> (31 - 12); + tmp = (*(in + 19) - base) >> (31 - 12); tmp |= (*(in + 20) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 20) - base) >> (31 - 11); + tmp = (*(in + 20) - base) >> (31 - 11); tmp |= (*(in + 21) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 21) - base) >> (31 - 10); + tmp = (*(in + 21) - base) >> (31 - 10); tmp |= (*(in + 22) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 22) - base) >> (31 - 9); + tmp = (*(in + 22) - base) >> (31 - 9); tmp |= (*(in + 23) - base) << 9; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 23) - base) >> (31 - 
8); + tmp = (*(in + 23) - base) >> (31 - 8); tmp |= (*(in + 24) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 24) - base) >> (31 - 7); + tmp = (*(in + 24) - base) >> (31 - 7); tmp |= (*(in + 25) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 25) - base) >> (31 - 6); + tmp = (*(in + 25) - base) >> (31 - 6); tmp |= (*(in + 26) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 26) - base) >> (31 - 5); + tmp = (*(in + 26) - base) >> (31 - 5); tmp |= (*(in + 27) - base) << 5; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 27) - base) >> (31 - 4); + tmp = (*(in + 27) - base) >> (31 - 4); tmp |= (*(in + 28) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 112) */ - tmp = (*(in + 28) - base) >> (31 - 3); + tmp = (*(in + 28) - base) >> (31 - 3); tmp |= (*(in + 29) - base) << 3; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 116) */ - tmp = (*(in + 29) - base) >> (31 - 2); + tmp = (*(in + 29) - base) >> (31 - 2); tmp |= (*(in + 30) - base) << 2; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 120) */ - tmp = (*(in + 30) - base) >> (31 - 1); + tmp = (*(in + 30) - base) >> (31 - 1); tmp |= (*(in + 31) - base) << 1; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -6032,11 +5996,11 @@ pack31_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 124; } -static uint32_t -unpack31_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack31_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2147483647); tmp = 
(*in32 >> 31); in32++; /* consumed: 4 bytes (total: 8) */ @@ -6187,13 +6151,12 @@ unpack31_32(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 124) */ tmp |= (*in32 % (1U << 1)) << (31 - 1); *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 1) & 2147483647); + *(out + 31) = base + ((*in32 >> 1) & 2147483647); /* remaining: 0 bits */ return 124; } -static uint32_t -pack32_32(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack32_32(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t i; uint32_t *out32 = (uint32_t *)out; for (i = 0; i < 32; i++) @@ -6201,8 +6164,7 @@ pack32_32(uint32_t base, const uint32_t *in, uint8_t *out) { return 32 * sizeof(uint32_t); } -static uint32_t -unpack32_32(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack32_32(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t i; uint32_t *in32 = (uint32_t *)in; for (i = 0; i < 32; i++) @@ -6211,81 +6173,24 @@ unpack32_32(uint32_t base, const uint8_t *in, uint32_t *out) { } for_packfunc_t for_pack32[33] = { - pack0_n, - pack1_32, - pack2_32, - pack3_32, - pack4_32, - pack5_32, - pack6_32, - pack7_32, - pack8_32, - pack9_32, - pack10_32, - pack11_32, - pack12_32, - pack13_32, - pack14_32, - pack15_32, - pack16_32, - pack17_32, - pack18_32, - pack19_32, - pack20_32, - pack21_32, - pack22_32, - pack23_32, - pack24_32, - pack25_32, - pack26_32, - pack27_32, - pack28_32, - pack29_32, - pack30_32, - pack31_32, - pack32_32 -}; + pack0_n, pack1_32, pack2_32, pack3_32, pack4_32, pack5_32, pack6_32, + pack7_32, pack8_32, pack9_32, pack10_32, pack11_32, pack12_32, pack13_32, + pack14_32, pack15_32, pack16_32, pack17_32, pack18_32, pack19_32, pack20_32, + pack21_32, pack22_32, pack23_32, pack24_32, pack25_32, pack26_32, pack27_32, + pack28_32, pack29_32, pack30_32, pack31_32, pack32_32}; for_unpackfunc_t for_unpack32[33] = { - unpack0_32, - unpack1_32, - unpack2_32, - unpack3_32, - unpack4_32, - unpack5_32, - 
unpack6_32, - unpack7_32, - unpack8_32, - unpack9_32, - unpack10_32, - unpack11_32, - unpack12_32, - unpack13_32, - unpack14_32, - unpack15_32, - unpack16_32, - unpack17_32, - unpack18_32, - unpack19_32, - unpack20_32, - unpack21_32, - unpack22_32, - unpack23_32, - unpack24_32, - unpack25_32, - unpack26_32, - unpack27_32, - unpack28_32, - unpack29_32, - unpack30_32, - unpack31_32, - unpack32_32 -}; - -static uint32_t -pack1_16(uint32_t base, const uint32_t *in, uint8_t *out) { + unpack0_32, unpack1_32, unpack2_32, unpack3_32, unpack4_32, + unpack5_32, unpack6_32, unpack7_32, unpack8_32, unpack9_32, + unpack10_32, unpack11_32, unpack12_32, unpack13_32, unpack14_32, + unpack15_32, unpack16_32, unpack17_32, unpack18_32, unpack19_32, + unpack20_32, unpack21_32, unpack22_32, unpack23_32, unpack24_32, + unpack25_32, unpack26_32, unpack27_32, unpack28_32, unpack29_32, + unpack30_32, unpack31_32, unpack32_32}; + +static uint32_t pack1_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 1; tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 3; @@ -6308,34 +6213,33 @@ pack1_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 2; } -static uint32_t -unpack1_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack1_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); - *(out + 8) = base + ((*in32 >> 8) & 1); - *(out + 9) = base + ((*in32 >> 9) & 1); - *(out + 10) = base + ((*in32 >> 10) & 1); - *(out + 11) = base + ((*in32 >> 11) & 
1); - *(out + 12) = base + ((*in32 >> 12) & 1); - *(out + 13) = base + ((*in32 >> 13) & 1); - *(out + 14) = base + ((*in32 >> 14) & 1); - *(out + 15) = base + ((*in32 >> 15) & 1); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1); + *(out + 1) = base + ((*in32 >> 1) & 1); + *(out + 2) = base + ((*in32 >> 2) & 1); + *(out + 3) = base + ((*in32 >> 3) & 1); + *(out + 4) = base + ((*in32 >> 4) & 1); + *(out + 5) = base + ((*in32 >> 5) & 1); + *(out + 6) = base + ((*in32 >> 6) & 1); + *(out + 7) = base + ((*in32 >> 7) & 1); + *(out + 8) = base + ((*in32 >> 8) & 1); + *(out + 9) = base + ((*in32 >> 9) & 1); + *(out + 10) = base + ((*in32 >> 10) & 1); + *(out + 11) = base + ((*in32 >> 11) & 1); + *(out + 12) = base + ((*in32 >> 12) & 1); + *(out + 13) = base + ((*in32 >> 13) & 1); + *(out + 14) = base + ((*in32 >> 14) & 1); + *(out + 15) = base + ((*in32 >> 15) & 1); /* remaining: 16 bits */ return 2; } -static uint32_t -pack2_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack2_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 2; tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 6; @@ -6358,34 +6262,33 @@ pack2_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 4; } -static uint32_t -unpack2_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack2_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); - *(out + 8) = base + ((*in32 >> 16) & 3); - *(out + 9) = base + ((*in32 >> 
18) & 3); - *(out + 10) = base + ((*in32 >> 20) & 3); - *(out + 11) = base + ((*in32 >> 22) & 3); - *(out + 12) = base + ((*in32 >> 24) & 3); - *(out + 13) = base + ((*in32 >> 26) & 3); - *(out + 14) = base + ((*in32 >> 28) & 3); - *(out + 15) = base + ((*in32 >> 30) & 3); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 3); + *(out + 1) = base + ((*in32 >> 2) & 3); + *(out + 2) = base + ((*in32 >> 4) & 3); + *(out + 3) = base + ((*in32 >> 6) & 3); + *(out + 4) = base + ((*in32 >> 8) & 3); + *(out + 5) = base + ((*in32 >> 10) & 3); + *(out + 6) = base + ((*in32 >> 12) & 3); + *(out + 7) = base + ((*in32 >> 14) & 3); + *(out + 8) = base + ((*in32 >> 16) & 3); + *(out + 9) = base + ((*in32 >> 18) & 3); + *(out + 10) = base + ((*in32 >> 20) & 3); + *(out + 11) = base + ((*in32 >> 22) & 3); + *(out + 12) = base + ((*in32 >> 24) & 3); + *(out + 13) = base + ((*in32 >> 26) & 3); + *(out + 14) = base + ((*in32 >> 28) & 3); + *(out + 15) = base + ((*in32 >> 30) & 3); /* remaining: 0 bits */ return 4; } -static uint32_t -pack3_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack3_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 3; tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 9; @@ -6399,7 +6302,7 @@ pack3_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 10) - base) >> (3 - 1); + tmp = (*(in + 10) - base) >> (3 - 1); tmp |= (*(in + 11) - base) << 1; tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 7; @@ -6412,38 +6315,37 @@ pack3_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 6; } -static uint32_t -unpack3_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack3_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; 
- uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); - *(out + 8) = base + ((*in32 >> 24) & 7); - *(out + 9) = base + ((*in32 >> 27) & 7); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 7); + *(out + 1) = base + ((*in32 >> 3) & 7); + *(out + 2) = base + ((*in32 >> 6) & 7); + *(out + 3) = base + ((*in32 >> 9) & 7); + *(out + 4) = base + ((*in32 >> 12) & 7); + *(out + 5) = base + ((*in32 >> 15) & 7); + *(out + 6) = base + ((*in32 >> 18) & 7); + *(out + 7) = base + ((*in32 >> 21) & 7); + *(out + 8) = base + ((*in32 >> 24) & 7); + *(out + 9) = base + ((*in32 >> 27) & 7); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 1)) << (3 - 1); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 1) & 7); - *(out + 12) = base + ((*in32 >> 4) & 7); - *(out + 13) = base + ((*in32 >> 7) & 7); - *(out + 14) = base + ((*in32 >> 10) & 7); - *(out + 15) = base + ((*in32 >> 13) & 7); + *(out + 11) = base + ((*in32 >> 1) & 7); + *(out + 12) = base + ((*in32 >> 4) & 7); + *(out + 13) = base + ((*in32 >> 7) & 7); + *(out + 14) = base + ((*in32 >> 10) & 7); + *(out + 15) = base + ((*in32 >> 13) & 7); /* remaining: 16 bits */ return 6; } -static uint32_t -pack4_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack4_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 4; tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 12; @@ -6454,7 +6356,7 @@ pack4_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes 
(total: 4) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 4; tmp |= (*(in + 10) - base) << 8; tmp |= (*(in + 11) - base) << 12; @@ -6469,36 +6371,35 @@ pack4_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 8; } -static uint32_t -unpack4_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack4_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + ((*in32 >> 28) & 15); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 15); + *(out + 1) = base + ((*in32 >> 4) & 15); + *(out + 2) = base + ((*in32 >> 8) & 15); + *(out + 3) = base + ((*in32 >> 12) & 15); + *(out + 4) = base + ((*in32 >> 16) & 15); + *(out + 5) = base + ((*in32 >> 20) & 15); + *(out + 6) = base + ((*in32 >> 24) & 15); + *(out + 7) = base + ((*in32 >> 28) & 15); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 8) = base + ((*in32 >> 0) & 15); - *(out + 9) = base + ((*in32 >> 4) & 15); - *(out + 10) = base + ((*in32 >> 8) & 15); - *(out + 11) = base + ((*in32 >> 12) & 15); - *(out + 12) = base + ((*in32 >> 16) & 15); - *(out + 13) = base + ((*in32 >> 20) & 15); - *(out + 14) = base + ((*in32 >> 24) & 15); - *(out + 15) = base + ((*in32 >> 28) & 15); + *(out + 8) = base + ((*in32 >> 0) & 15); + *(out + 9) = base + ((*in32 >> 4) & 15); + *(out + 10) = base + ((*in32 >> 8) & 15); + *(out + 11) = base + ((*in32 >> 12) & 15); + *(out + 12) = base + ((*in32 >> 16) & 15); + *(out + 13) = base + ((*in32 >> 20) & 15); + *(out + 14) = base + ((*in32 >> 24) & 15); + *(out + 15) = base + ((*in32 >> 28) & 15); /* remaining: 0 bits */ 
return 8; } -static uint32_t -pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 5; tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 15; @@ -6508,7 +6409,7 @@ pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); + tmp = (*(in + 6) - base) >> (5 - 3); tmp |= (*(in + 7) - base) << 3; tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 13; @@ -6518,7 +6419,7 @@ pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 12) - base) >> (5 - 1); + tmp = (*(in + 12) - base) >> (5 - 1); tmp |= (*(in + 13) - base) << 1; tmp |= (*(in + 14) - base) << 6; tmp |= (*(in + 15) - base) << 11; @@ -6529,42 +6430,41 @@ pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 10; } -static uint32_t -unpack5_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack5_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + ((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 31); + *(out + 1) = base + ((*in32 >> 5) & 31); + *(out + 2) = base + ((*in32 >> 10) & 31); + *(out + 3) = base + ((*in32 >> 15) & 31); + *(out + 4) = base + ((*in32 >> 20) & 31); + *(out + 5) = base + ((*in32 >> 25) & 31); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) 
<< (5 - 3); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); - *(out + 8) = base + ((*in32 >> 8) & 31); - *(out + 9) = base + ((*in32 >> 13) & 31); - *(out + 10) = base + ((*in32 >> 18) & 31); - *(out + 11) = base + ((*in32 >> 23) & 31); + *(out + 7) = base + ((*in32 >> 3) & 31); + *(out + 8) = base + ((*in32 >> 8) & 31); + *(out + 9) = base + ((*in32 >> 13) & 31); + *(out + 10) = base + ((*in32 >> 18) & 31); + *(out + 11) = base + ((*in32 >> 23) & 31); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 1)) << (5 - 1); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 1) & 31); - *(out + 14) = base + ((*in32 >> 6) & 31); - *(out + 15) = base + ((*in32 >> 11) & 31); + *(out + 13) = base + ((*in32 >> 1) & 31); + *(out + 14) = base + ((*in32 >> 6) & 31); + *(out + 15) = base + ((*in32 >> 11) & 31); /* remaining: 16 bits */ return 10; } -static uint32_t -pack6_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack6_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 6; tmp |= (*(in + 2) - base) << 12; tmp |= (*(in + 3) - base) << 18; @@ -6573,7 +6473,7 @@ pack6_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); + tmp = (*(in + 5) - base) >> (6 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 10; tmp |= (*(in + 8) - base) << 16; @@ -6582,7 +6482,7 @@ pack6_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 10) - base) >> (6 - 2); + tmp = (*(in + 10) - base) >> (6 - 2); tmp |= (*(in + 11) - base) << 2; tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 14; @@ -6595,42 +6495,41 @@ pack6_16(uint32_t base, const 
uint32_t *in, uint8_t *out) { return 12; } -static uint32_t -unpack6_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack6_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); - *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 63); + *(out + 1) = base + ((*in32 >> 6) & 63); + *(out + 2) = base + ((*in32 >> 12) & 63); + *(out + 3) = base + ((*in32 >> 18) & 63); + *(out + 4) = base + ((*in32 >> 24) & 63); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (6 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + ((*in32 >> 10) & 63); - *(out + 8) = base + ((*in32 >> 16) & 63); - *(out + 9) = base + ((*in32 >> 22) & 63); + *(out + 6) = base + ((*in32 >> 4) & 63); + *(out + 7) = base + ((*in32 >> 10) & 63); + *(out + 8) = base + ((*in32 >> 16) & 63); + *(out + 9) = base + ((*in32 >> 22) & 63); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (6 - 2); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 2) & 63); - *(out + 12) = base + ((*in32 >> 8) & 63); - *(out + 13) = base + ((*in32 >> 14) & 63); - *(out + 14) = base + ((*in32 >> 20) & 63); - *(out + 15) = base + ((*in32 >> 26) & 63); + *(out + 11) = base + ((*in32 >> 2) & 63); + *(out + 12) = base + ((*in32 >> 8) & 63); + *(out + 13) = base + ((*in32 >> 14) & 63); + *(out + 14) = base + ((*in32 >> 20) & 63); + *(out + 15) = base + ((*in32 >> 26) & 63); /* remaining: 0 bits */ return 12; } -static uint32_t -pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - 
tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 7; tmp |= (*(in + 2) - base) << 14; tmp |= (*(in + 3) - base) << 21; @@ -6638,7 +6537,7 @@ pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); + tmp = (*(in + 4) - base) >> (7 - 3); tmp |= (*(in + 5) - base) << 3; tmp |= (*(in + 6) - base) << 10; tmp |= (*(in + 7) - base) << 17; @@ -6647,7 +6546,7 @@ pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 9) - base) >> (7 - 6); + tmp = (*(in + 9) - base) >> (7 - 6); tmp |= (*(in + 10) - base) << 6; tmp |= (*(in + 11) - base) << 13; tmp |= (*(in + 12) - base) << 20; @@ -6655,7 +6554,7 @@ pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 13) - base) >> (7 - 2); + tmp = (*(in + 13) - base) >> (7 - 2); tmp |= (*(in + 14) - base) << 2; tmp |= (*(in + 15) - base) << 9; /* remaining: 16 bits */ @@ -6665,67 +6564,66 @@ pack7_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 14; } -static uint32_t -unpack7_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack7_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 127); + *(out + 1) = base + ((*in32 >> 7) & 127); + *(out + 2) = base + ((*in32 >> 14) & 127); + *(out + 3) = base + ((*in32 >> 21) & 127); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) << (7 - 3); *(out + 
4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); - *(out + 8) = base + ((*in32 >> 24) & 127); + *(out + 5) = base + ((*in32 >> 3) & 127); + *(out + 6) = base + ((*in32 >> 10) & 127); + *(out + 7) = base + ((*in32 >> 17) & 127); + *(out + 8) = base + ((*in32 >> 24) & 127); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (7 - 6); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 127); - *(out + 11) = base + ((*in32 >> 13) & 127); - *(out + 12) = base + ((*in32 >> 20) & 127); + *(out + 10) = base + ((*in32 >> 6) & 127); + *(out + 11) = base + ((*in32 >> 13) & 127); + *(out + 12) = base + ((*in32 >> 20) & 127); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 2)) << (7 - 2); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 127); - *(out + 15) = base + ((*in32 >> 9) & 127); + *(out + 14) = base + ((*in32 >> 2) & 127); + *(out + 15) = base + ((*in32 >> 9) & 127); /* remaining: 16 bits */ return 14; } -static uint32_t -pack8_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack8_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 8; tmp |= (*(in + 2) - base) << 16; tmp |= (*(in + 3) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 8; tmp |= (*(in + 6) - base) << 16; tmp |= (*(in + 7) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 8; tmp |= (*(in + 10) - base) << 16; tmp |= (*(in + 11) - base) << 24; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 13) - base) << 8; tmp |= (*(in + 14) - base) << 16; tmp |= (*(in + 15) - base) << 24; @@ -6736,47 +6634,46 @@ pack8_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 16; } -static uint32_t -unpack8_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack8_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - *(out + 3) = base + ((*in32 >> 24) & 255); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 255); + *(out + 1) = base + ((*in32 >> 8) & 255); + *(out + 2) = base + ((*in32 >> 16) & 255); + *(out + 3) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); + *(out + 4) = base + ((*in32 >> 0) & 255); + *(out + 5) = base + ((*in32 >> 8) & 255); + *(out + 6) = base + ((*in32 >> 16) & 255); + *(out + 7) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 8) = base + ((*in32 >> 0) & 255); - *(out + 9) = base + ((*in32 >> 8) & 255); - *(out + 10) = base + ((*in32 >> 16) & 255); - *(out + 11) = base + ((*in32 >> 24) & 255); + *(out + 8) = base + ((*in32 >> 0) & 255); + *(out + 9) = base + ((*in32 >> 8) & 255); + *(out + 10) = base + ((*in32 >> 16) & 255); + *(out + 11) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 12) = base + ((*in32 >> 0) & 255); - *(out + 13) = base + ((*in32 >> 8) & 255); - *(out + 14) = base + ((*in32 >> 16) & 255); - *(out + 15) = base + ((*in32 >> 24) & 255); + *(out + 12) = base + ((*in32 >> 0) & 255); + 
*(out + 13) = base + ((*in32 >> 8) & 255); + *(out + 14) = base + ((*in32 >> 16) & 255); + *(out + 15) = base + ((*in32 >> 24) & 255); /* remaining: 0 bits */ return 16; } -static uint32_t -pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 9; tmp |= (*(in + 2) - base) << 18; tmp |= (*(in + 3) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (9 - 4); + tmp = (*(in + 3) - base) >> (9 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 13; tmp |= (*(in + 6) - base) << 22; @@ -6784,14 +6681,14 @@ pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); + tmp = (*(in + 7) - base) >> (9 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 17; tmp |= (*(in + 10) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 10) - base) >> (9 - 3); + tmp = (*(in + 10) - base) >> (9 - 3); tmp |= (*(in + 11) - base) << 3; tmp |= (*(in + 12) - base) << 12; tmp |= (*(in + 13) - base) << 21; @@ -6799,7 +6696,7 @@ pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 14) - base) >> (9 - 7); + tmp = (*(in + 14) - base) >> (9 - 7); tmp |= (*(in + 15) - base) << 7; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -6808,78 +6705,77 @@ pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 18; } -static uint32_t -unpack9_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack9_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = 
(uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + ((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 511); + *(out + 1) = base + ((*in32 >> 9) & 511); + *(out + 2) = base + ((*in32 >> 18) & 511); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (9 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); + *(out + 4) = base + ((*in32 >> 4) & 511); + *(out + 5) = base + ((*in32 >> 13) & 511); + *(out + 6) = base + ((*in32 >> 22) & 511); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (9 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 511); - *(out + 9) = base + ((*in32 >> 17) & 511); + *(out + 8) = base + ((*in32 >> 8) & 511); + *(out + 9) = base + ((*in32 >> 17) & 511); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 3)) << (9 - 3); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 3) & 511); - *(out + 12) = base + ((*in32 >> 12) & 511); - *(out + 13) = base + ((*in32 >> 21) & 511); + *(out + 11) = base + ((*in32 >> 3) & 511); + *(out + 12) = base + ((*in32 >> 12) & 511); + *(out + 13) = base + ((*in32 >> 21) & 511); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 7)) << (9 - 7); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 7) & 511); + *(out + 15) = base + ((*in32 >> 7) & 511); /* remaining: 16 bits */ return 18; } -static uint32_t -pack10_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack10_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 10; tmp |= (*(in + 2) 
- base) << 20; tmp |= (*(in + 3) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); + tmp = (*(in + 3) - base) >> (10 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 18; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); + tmp = (*(in + 6) - base) >> (10 - 6); tmp |= (*(in + 7) - base) << 6; tmp |= (*(in + 8) - base) << 16; tmp |= (*(in + 9) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 9) - base) >> (10 - 4); + tmp = (*(in + 9) - base) >> (10 - 4); tmp |= (*(in + 10) - base) << 4; tmp |= (*(in + 11) - base) << 14; tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 12) - base) >> (10 - 2); + tmp = (*(in + 12) - base) >> (10 - 2); tmp |= (*(in + 13) - base) << 2; tmp |= (*(in + 14) - base) << 12; tmp |= (*(in + 15) - base) << 22; @@ -6890,84 +6786,83 @@ pack10_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 20; } -static uint32_t -unpack10_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack10_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1023); + *(out + 1) = base + ((*in32 >> 10) & 1023); + *(out + 2) = base + ((*in32 >> 20) & 1023); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (10 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 18) & 1023); + *(out + 4) = base + ((*in32 >> 8) & 1023); + 
*(out + 5) = base + ((*in32 >> 18) & 1023); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (10 - 6); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); - *(out + 8) = base + ((*in32 >> 16) & 1023); + *(out + 7) = base + ((*in32 >> 6) & 1023); + *(out + 8) = base + ((*in32 >> 16) & 1023); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (10 - 4); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 1023); - *(out + 11) = base + ((*in32 >> 14) & 1023); + *(out + 10) = base + ((*in32 >> 4) & 1023); + *(out + 11) = base + ((*in32 >> 14) & 1023); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (10 - 2); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 2) & 1023); - *(out + 14) = base + ((*in32 >> 12) & 1023); - *(out + 15) = base + ((*in32 >> 22) & 1023); + *(out + 13) = base + ((*in32 >> 2) & 1023); + *(out + 14) = base + ((*in32 >> 12) & 1023); + *(out + 15) = base + ((*in32 >> 22) & 1023); /* remaining: 0 bits */ return 20; } -static uint32_t -pack11_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack11_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 11; tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (11 - 1); + tmp = (*(in + 2) - base) >> (11 - 1); tmp |= (*(in + 3) - base) << 1; tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); + tmp = (*(in + 5) - base) >> (11 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 13; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) >> (11 - 3); + tmp = (*(in + 8) - base) >> (11 - 3); tmp |= (*(in + 9) - base) << 3; tmp |= (*(in + 10) - base) << 14; tmp |= (*(in + 11) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 11) - base) >> (11 - 4); + tmp = (*(in + 11) - base) >> (11 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 15; tmp |= (*(in + 14) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 14) - base) >> (11 - 5); + tmp = (*(in + 14) - base) >> (11 - 5); tmp |= (*(in + 15) - base) << 5; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -6976,86 +6871,85 @@ pack11_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 22; } -static uint32_t -unpack11_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack11_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2047); + *(out + 1) = base + ((*in32 >> 11) & 2047); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 1)) << (11 - 1); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); + *(out + 3) = base + ((*in32 >> 1) & 2047); + *(out + 4) = base + ((*in32 >> 12) & 2047); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (11 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); + *(out + 6) = base + ((*in32 >> 2) & 2047); + *(out + 7) = base + ((*in32 >> 13) & 2047); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U 
<< 3)) << (11 - 3); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 3) & 2047); - *(out + 10) = base + ((*in32 >> 14) & 2047); + *(out + 9) = base + ((*in32 >> 3) & 2047); + *(out + 10) = base + ((*in32 >> 14) & 2047); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (11 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 2047); - *(out + 13) = base + ((*in32 >> 15) & 2047); + *(out + 12) = base + ((*in32 >> 4) & 2047); + *(out + 13) = base + ((*in32 >> 15) & 2047); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 5)) << (11 - 5); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 5) & 2047); + *(out + 15) = base + ((*in32 >> 5) & 2047); /* remaining: 16 bits */ return 22; } -static uint32_t -pack12_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack12_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 12; tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); + tmp = (*(in + 2) - base) >> (12 - 4); tmp |= (*(in + 3) - base) << 4; tmp |= (*(in + 4) - base) << 16; tmp |= (*(in + 5) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); + tmp = (*(in + 5) - base) >> (12 - 8); tmp |= (*(in + 6) - base) << 8; tmp |= (*(in + 7) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 12; tmp |= (*(in + 10) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 10) - base) >> (12 - 4); + tmp = (*(in + 10) - base) >> (12 - 4); tmp 
|= (*(in + 11) - base) << 4; tmp |= (*(in + 12) - base) << 16; tmp |= (*(in + 13) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 13) - base) >> (12 - 8); + tmp = (*(in + 13) - base) >> (12 - 8); tmp |= (*(in + 14) - base) << 8; tmp |= (*(in + 15) - base) << 20; /* remaining: 0 bits */ @@ -7065,90 +6959,89 @@ pack12_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 24; } -static uint32_t -unpack12_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack12_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4095); + *(out + 1) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); + *(out + 3) = base + ((*in32 >> 4) & 4095); + *(out + 4) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); + *(out + 6) = base + ((*in32 >> 8) & 4095); + *(out + 7) = base + ((*in32 >> 20) & 4095); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 8) = base + ((*in32 >> 0) & 4095); - *(out + 9) = base + ((*in32 >> 12) & 4095); + *(out + 8) = base + ((*in32 >> 0) & 4095); + *(out + 9) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 4) & 4095); - *(out + 12) = base + ((*in32 >> 16) & 4095); + *(out + 11) = base + ((*in32 >> 4) & 
4095); + *(out + 12) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 8) & 4095); - *(out + 15) = base + ((*in32 >> 20) & 4095); + *(out + 14) = base + ((*in32 >> 8) & 4095); + *(out + 15) = base + ((*in32 >> 20) & 4095); /* remaining: 0 bits */ return 24; } -static uint32_t -pack13_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack13_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 13; tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); + tmp = (*(in + 2) - base) >> (13 - 7); tmp |= (*(in + 3) - base) << 7; tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); + tmp = (*(in + 4) - base) >> (13 - 1); tmp |= (*(in + 5) - base) << 1; tmp |= (*(in + 6) - base) << 14; tmp |= (*(in + 7) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); + tmp = (*(in + 7) - base) >> (13 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (13 - 2); + tmp = (*(in + 9) - base) >> (13 - 2); tmp |= (*(in + 10) - base) << 2; tmp |= (*(in + 11) - base) << 15; tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 12) - base) >> (13 - 9); + tmp = (*(in + 12) - base) >> (13 - 9); tmp |= (*(in + 13) - base) << 9; tmp |= (*(in + 14) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* 
consumed: 4 bytes (total: 24) */ - tmp = (*(in + 14) - base) >> (13 - 3); + tmp = (*(in + 14) - base) >> (13 - 3); tmp |= (*(in + 15) - base) << 3; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -7157,95 +7050,94 @@ pack13_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 26; } -static uint32_t -unpack13_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack13_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8191); + *(out + 1) = base + ((*in32 >> 13) & 8191); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 7)) << (13 - 7); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); + *(out + 3) = base + ((*in32 >> 7) & 8191); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 1)) << (13 - 1); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); + *(out + 5) = base + ((*in32 >> 1) & 8191); + *(out + 6) = base + ((*in32 >> 14) & 8191); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 8)) << (13 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 8191); + *(out + 8) = base + ((*in32 >> 8) & 8191); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (13 - 2); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 8191); - *(out + 11) = base + ((*in32 >> 15) & 8191); + *(out + 10) = base + ((*in32 >> 2) & 8191); + *(out + 11) = base + ((*in32 >> 15) & 8191); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 9)) << (13 - 9); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 9) & 8191); + *(out + 13) = 
base + ((*in32 >> 9) & 8191); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 3)) << (13 - 3); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 3) & 8191); + *(out + 15) = base + ((*in32 >> 3) & 8191); /* remaining: 16 bits */ return 26; } -static uint32_t -pack14_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack14_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 14; tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); + tmp = (*(in + 2) - base) >> (14 - 10); tmp |= (*(in + 3) - base) << 10; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); + tmp = (*(in + 4) - base) >> (14 - 6); tmp |= (*(in + 5) - base) << 6; tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); + tmp = (*(in + 6) - base) >> (14 - 2); tmp |= (*(in + 7) - base) << 2; tmp |= (*(in + 8) - base) << 16; tmp |= (*(in + 9) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (14 - 12); + tmp = (*(in + 9) - base) >> (14 - 12); tmp |= (*(in + 10) - base) << 12; tmp |= (*(in + 11) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 11) - base) >> (14 - 8); + tmp = (*(in + 11) - base) >> (14 - 8); tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 13) - base) >> (14 - 4); + tmp = (*(in + 13) - base) >> (14 - 4); tmp |= (*(in + 14) - base) << 4; tmp 
|= (*(in + 15) - base) << 18; /* remaining: 0 bits */ @@ -7255,100 +7147,99 @@ pack14_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 28; } -static uint32_t -unpack14_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack14_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16383); + *(out + 1) = base + ((*in32 >> 14) & 16383); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (14 - 10); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); + *(out + 3) = base + ((*in32 >> 10) & 16383); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (14 - 6); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 16383); + *(out + 5) = base + ((*in32 >> 6) & 16383); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 2)) << (14 - 2); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); - *(out + 8) = base + ((*in32 >> 16) & 16383); + *(out + 7) = base + ((*in32 >> 2) & 16383); + *(out + 8) = base + ((*in32 >> 16) & 16383); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 12)) << (14 - 12); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 12) & 16383); + *(out + 10) = base + ((*in32 >> 12) & 16383); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (14 - 8); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 16383); + *(out + 12) = base + ((*in32 >> 8) & 16383); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (14 - 4); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 4) & 16383); - *(out + 15) = 
base + ((*in32 >> 18) & 16383); + *(out + 14) = base + ((*in32 >> 4) & 16383); + *(out + 15) = base + ((*in32 >> 18) & 16383); /* remaining: 0 bits */ return 28; } -static uint32_t -pack15_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack15_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 15; tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); + tmp = (*(in + 2) - base) >> (15 - 13); tmp |= (*(in + 3) - base) << 13; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); + tmp = (*(in + 4) - base) >> (15 - 11); tmp |= (*(in + 5) - base) << 11; tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); + tmp = (*(in + 6) - base) >> (15 - 9); tmp |= (*(in + 7) - base) << 9; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) >> (15 - 7); + tmp = (*(in + 8) - base) >> (15 - 7); tmp |= (*(in + 9) - base) << 7; tmp |= (*(in + 10) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) >> (15 - 5); + tmp = (*(in + 10) - base) >> (15 - 5); tmp |= (*(in + 11) - base) << 5; tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) >> (15 - 3); + tmp = (*(in + 12) - base) >> (15 - 3); tmp |= (*(in + 13) - base) << 3; tmp |= (*(in + 14) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) >> (15 - 1); + tmp 
= (*(in + 14) - base) >> (15 - 1); tmp |= (*(in + 15) - base) << 1; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -7357,97 +7248,96 @@ pack15_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 30; } -static uint32_t -unpack15_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack15_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 32767); + *(out + 1) = base + ((*in32 >> 15) & 32767); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 13)) << (15 - 13); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 13) & 32767); + *(out + 3) = base + ((*in32 >> 13) & 32767); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 11)) << (15 - 11); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); + *(out + 5) = base + ((*in32 >> 11) & 32767); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (15 - 9); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); + *(out + 7) = base + ((*in32 >> 9) & 32767); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 7)) << (15 - 7); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 7) & 32767); + *(out + 9) = base + ((*in32 >> 7) & 32767); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 5)) << (15 - 5); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 5) & 32767); + *(out + 11) = base + ((*in32 >> 5) & 32767); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 3)) << (15 - 3); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 3) & 32767); + *(out + 13) = base + ((*in32 >> 3) & 
32767); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 1)) << (15 - 1); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 1) & 32767); + *(out + 15) = base + ((*in32 >> 1) & 32767); /* remaining: 16 bits */ return 30; } -static uint32_t -pack16_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack16_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; + tmp = (*(in + 2) - base) << 0; tmp |= (*(in + 3) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; + tmp = (*(in + 6) - base) << 0; tmp |= (*(in + 7) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) << 0; + tmp = (*(in + 10) - base) << 0; tmp |= (*(in + 11) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 13) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) << 0; + tmp = (*(in + 14) - base) << 0; tmp |= (*(in + 15) - base) << 16; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -7456,95 +7346,94 @@ pack16_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 32; 
} -static uint32_t -unpack16_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack16_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 65535); + *(out + 1) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); + *(out + 2) = base + ((*in32 >> 0) & 65535); + *(out + 3) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); + *(out + 4) = base + ((*in32 >> 0) & 65535); + *(out + 5) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); + *(out + 6) = base + ((*in32 >> 0) & 65535); + *(out + 7) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 20) */ - *(out + 8) = base + ((*in32 >> 0) & 65535); - *(out + 9) = base + ((*in32 >> 16) & 65535); + *(out + 8) = base + ((*in32 >> 0) & 65535); + *(out + 9) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 10) = base + ((*in32 >> 0) & 65535); - *(out + 11) = base + ((*in32 >> 16) & 65535); + *(out + 10) = base + ((*in32 >> 0) & 65535); + *(out + 11) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 12) = base + ((*in32 >> 0) & 65535); - *(out + 13) = base + ((*in32 >> 16) & 65535); + *(out + 12) = base + ((*in32 >> 0) & 65535); + *(out + 13) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 14) = base + ((*in32 >> 0) & 65535); - *(out + 15) = base + ((*in32 >> 16) & 65535); + *(out + 14) = base + ((*in32 >> 
0) & 65535); + *(out + 15) = base + ((*in32 >> 16) & 65535); /* remaining: 0 bits */ return 32; } -static uint32_t -pack17_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack17_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); + tmp = (*(in + 1) - base) >> (17 - 2); tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); + tmp = (*(in + 3) - base) >> (17 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); + tmp = (*(in + 5) - base) >> (17 - 6); tmp |= (*(in + 6) - base) << 6; tmp |= (*(in + 7) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); + tmp = (*(in + 7) - base) >> (17 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 9) - base) >> (17 - 10); + tmp = (*(in + 9) - base) >> (17 - 10); tmp |= (*(in + 10) - base) << 10; tmp |= (*(in + 11) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 11) - base) >> (17 - 12); + tmp = (*(in + 11) - base) >> (17 - 12); tmp |= (*(in + 12) - base) << 12; tmp |= (*(in + 13) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 13) - base) >> (17 - 14); + tmp = (*(in + 13) - base) >> (17 - 14); tmp |= (*(in + 14) - base) << 14; tmp |= (*(in + 15) - base) << 
31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 15) - base) >> (17 - 16); + tmp = (*(in + 15) - base) >> (17 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 34) */ @@ -7552,53 +7441,53 @@ pack17_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 34; } -static uint32_t -unpack17_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack17_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 131071); tmp = (*in32 >> 17); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 2)) << (17 - 2); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); + *(out + 2) = base + ((*in32 >> 2) & 131071); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 4)) << (17 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); + *(out + 4) = base + ((*in32 >> 4) & 131071); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 6)) << (17 - 6); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); + *(out + 6) = base + ((*in32 >> 6) & 131071); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 8)) << (17 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 131071); + *(out + 8) = base + ((*in32 >> 8) & 131071); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 10)) << (17 - 10); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 10) & 131071); + *(out + 10) = base + ((*in32 >> 10) & 131071); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 12)) << (17 - 12); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 
>> 12) & 131071); + *(out + 12) = base + ((*in32 >> 12) & 131071); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 14)) << (17 - 14); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 14) & 131071); + *(out + 14) = base + ((*in32 >> 14) & 131071); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 36) */ @@ -7608,56 +7497,55 @@ unpack17_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 34; } -static uint32_t -pack18_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack18_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); + tmp = (*(in + 1) - base) >> (18 - 4); tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); + tmp = (*(in + 3) - base) >> (18 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); + tmp = (*(in + 5) - base) >> (18 - 12); tmp |= (*(in + 6) - base) << 12; tmp |= (*(in + 7) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (18 - 16); + tmp = (*(in + 7) - base) >> (18 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (18 - 2); + tmp = (*(in + 8) - base) >> (18 - 2); tmp |= (*(in + 9) - base) << 2; tmp |= (*(in + 10) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (18 - 6); + tmp = 
(*(in + 10) - base) >> (18 - 6); tmp |= (*(in + 11) - base) << 6; tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 12) - base) >> (18 - 10); + tmp = (*(in + 12) - base) >> (18 - 10); tmp |= (*(in + 13) - base) << 10; tmp |= (*(in + 14) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 14) - base) >> (18 - 14); + tmp = (*(in + 14) - base) >> (18 - 14); tmp |= (*(in + 15) - base) << 14; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -7666,29 +7554,29 @@ pack18_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 36; } -static uint32_t -unpack18_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack18_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 262143); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (18 - 4); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); + *(out + 2) = base + ((*in32 >> 4) & 262143); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (18 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); + *(out + 4) = base + ((*in32 >> 8) & 262143); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 12)) << (18 - 12); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); + *(out + 6) = base + ((*in32 >> 12) & 262143); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ @@ -7699,84 +7587,83 @@ unpack18_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (18 - 2); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 
>> 2) & 262143); + *(out + 9) = base + ((*in32 >> 2) & 262143); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (18 - 6); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 6) & 262143); + *(out + 11) = base + ((*in32 >> 6) & 262143); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 10)) << (18 - 10); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 10) & 262143); + *(out + 13) = base + ((*in32 >> 10) & 262143); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 14)) << (18 - 14); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 14) & 262143); + *(out + 15) = base + ((*in32 >> 14) & 262143); /* remaining: 0 bits */ return 36; } -static uint32_t -pack19_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack19_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); + tmp = (*(in + 1) - base) >> (19 - 6); tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); + tmp = (*(in + 3) - base) >> (19 - 12); tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); + tmp = (*(in + 5) - base) >> (19 - 18); tmp |= (*(in + 6) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); + tmp = (*(in + 6) - base) >> (19 - 5); tmp |= (*(in + 7) - base) << 5; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (19 - 11); + tmp = (*(in + 8) - base) >> (19 - 11); tmp |= (*(in + 9) - base) << 11; tmp |= (*(in + 10) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (19 - 17); + tmp = (*(in + 10) - base) >> (19 - 17); tmp |= (*(in + 11) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (19 - 4); + tmp = (*(in + 11) - base) >> (19 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 13) - base) >> (19 - 10); + tmp = (*(in + 13) - base) >> (19 - 10); tmp |= (*(in + 14) - base) << 10; tmp |= (*(in + 15) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 15) - base) >> (19 - 16); + tmp = (*(in + 15) - base) >> (19 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 38) */ @@ -7784,23 +7671,23 @@ pack19_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 38; } -static uint32_t -unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 524287); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 6)) << (19 - 6); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); + *(out + 2) = base + ((*in32 >> 6) & 524287); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 12)) << (19 - 12); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); + *(out + 4) 
= base + ((*in32 >> 12) & 524287); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 16) */ @@ -7811,13 +7698,13 @@ unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 5)) << (19 - 5); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); + *(out + 7) = base + ((*in32 >> 5) & 524287); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 11)) << (19 - 11); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 11) & 524287); + *(out + 9) = base + ((*in32 >> 11) & 524287); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 28) */ @@ -7828,13 +7715,13 @@ unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 4)) << (19 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 524287); + *(out + 12) = base + ((*in32 >> 4) & 524287); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 10)) << (19 - 10); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 10) & 524287); + *(out + 14) = base + ((*in32 >> 10) & 524287); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 40) */ @@ -7844,59 +7731,58 @@ unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 38; } -static uint32_t -pack20_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack20_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); + tmp = (*(in + 1) - base) >> (20 - 8); tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); + tmp = 
(*(in + 3) - base) >> (20 - 16); tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); + tmp = (*(in + 4) - base) >> (20 - 4); tmp |= (*(in + 5) - base) << 4; tmp |= (*(in + 6) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); + tmp = (*(in + 6) - base) >> (20 - 12); tmp |= (*(in + 7) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (20 - 8); + tmp = (*(in + 9) - base) >> (20 - 8); tmp |= (*(in + 10) - base) << 8; tmp |= (*(in + 11) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (20 - 16); + tmp = (*(in + 11) - base) >> (20 - 16); tmp |= (*(in + 12) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (20 - 4); + tmp = (*(in + 12) - base) >> (20 - 4); tmp |= (*(in + 13) - base) << 4; tmp |= (*(in + 14) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 14) - base) >> (20 - 12); + tmp = (*(in + 14) - base) >> (20 - 12); tmp |= (*(in + 15) - base) << 12; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -7905,17 +7791,17 @@ pack20_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 40; } -static uint32_t -unpack20_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack20_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); + uint32_t tmp; + 
(void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 8) & 1048575); + *(out + 2) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ @@ -7926,22 +7812,22 @@ unpack20_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); + *(out + 5) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); + *(out + 7) = base + ((*in32 >> 12) & 1048575); in32++; /* consumed: 4 bytes (total: 24) */ - *(out + 8) = base + ((*in32 >> 0) & 1048575); + *(out + 8) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 8) & 1048575); + *(out + 10) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ @@ -7952,76 +7838,75 @@ unpack20_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 4) & 1048575); + *(out + 13) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 12) & 1048575); + *(out + 15) = base + ((*in32 >> 12) & 1048575); /* remaining: 0 bits */ return 40; } -static uint32_t -pack21_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack21_16(uint32_t base, const uint32_t *in, 
uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); + tmp = (*(in + 1) - base) >> (21 - 10); tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); + tmp = (*(in + 3) - base) >> (21 - 20); tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); + tmp = (*(in + 4) - base) >> (21 - 9); tmp |= (*(in + 5) - base) << 9; tmp |= (*(in + 6) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); + tmp = (*(in + 6) - base) >> (21 - 19); tmp |= (*(in + 7) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); + tmp = (*(in + 7) - base) >> (21 - 8); tmp |= (*(in + 8) - base) << 8; tmp |= (*(in + 9) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (21 - 18); + tmp = (*(in + 9) - base) >> (21 - 18); tmp |= (*(in + 10) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (21 - 7); + tmp = (*(in + 10) - base) >> (21 - 7); tmp |= (*(in + 11) - base) << 7; tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (21 - 17); + tmp = (*(in + 12) - base) >> (21 - 17); tmp |= (*(in + 13) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (21 - 6); + tmp = 
(*(in + 13) - base) >> (21 - 6); tmp |= (*(in + 14) - base) << 6; tmp |= (*(in + 15) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 15) - base) >> (21 - 16); + tmp = (*(in + 15) - base) >> (21 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 42) */ @@ -8029,17 +7914,17 @@ pack21_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 42; } -static uint32_t -unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2097151); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (21 - 10); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 2097151); + *(out + 2) = base + ((*in32 >> 10) & 2097151); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ @@ -8050,7 +7935,7 @@ unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (21 - 9); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); + *(out + 5) = base + ((*in32 >> 9) & 2097151); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ @@ -8061,7 +7946,7 @@ unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (21 - 8); *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 2097151); + *(out + 8) = base + ((*in32 >> 8) & 2097151); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ @@ -8072,7 +7957,7 @@ unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 7)) << (21 - 7); *(out + 10) = base + tmp; - *(out + 11) = 
base + ((*in32 >> 7) & 2097151); + *(out + 11) = base + ((*in32 >> 7) & 2097151); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ @@ -8083,7 +7968,7 @@ unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 6)) << (21 - 6); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 6) & 2097151); + *(out + 14) = base + ((*in32 >> 6) & 2097151); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 44) */ @@ -8093,64 +7978,63 @@ unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 42; } -static uint32_t -pack22_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack22_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); + tmp = (*(in + 1) - base) >> (22 - 12); tmp |= (*(in + 2) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); + tmp = (*(in + 2) - base) >> (22 - 2); tmp |= (*(in + 3) - base) << 2; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); + tmp = (*(in + 4) - base) >> (22 - 14); tmp |= (*(in + 5) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); + tmp = (*(in + 5) - base) >> (22 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); + tmp = (*(in + 7) - base) >> (22 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (22 - 6); + tmp = (*(in + 8) - base) >> (22 - 6); tmp |= (*(in + 9) - base) << 6; tmp |= (*(in + 10) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (22 - 18); + tmp = (*(in + 10) - base) >> (22 - 18); tmp |= (*(in + 11) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (22 - 8); + tmp = (*(in + 11) - base) >> (22 - 8); tmp |= (*(in + 12) - base) << 8; tmp |= (*(in + 13) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (22 - 20); + tmp = (*(in + 13) - base) >> (22 - 20); tmp |= (*(in + 14) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 14) - base) >> (22 - 10); + tmp = (*(in + 14) - base) >> (22 - 10); tmp |= (*(in + 15) - base) << 10; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -8159,11 +8043,11 @@ pack22_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 44; } -static uint32_t -unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4194303); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8174,7 +8058,7 @@ unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (22 - 2); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); + *(out + 3) = base + ((*in32 >> 2) & 4194303); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 16) */ @@ -8185,7 +8069,7 @@ unpack22_16(uint32_t base, const uint8_t *in, 
uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (22 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); + *(out + 6) = base + ((*in32 >> 4) & 4194303); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ @@ -8196,7 +8080,7 @@ unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 6)) << (22 - 6); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 6) & 4194303); + *(out + 9) = base + ((*in32 >> 6) & 4194303); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 32) */ @@ -8207,7 +8091,7 @@ unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (22 - 8); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 4194303); + *(out + 12) = base + ((*in32 >> 8) & 4194303); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 40) */ @@ -8218,74 +8102,73 @@ unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 10)) << (22 - 10); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 10) & 4194303); + *(out + 15) = base + ((*in32 >> 10) & 4194303); /* remaining: 0 bits */ return 44; } -static uint32_t -pack23_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack23_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 14); + tmp = (*(in + 1) - base) >> (23 - 14); tmp |= (*(in + 2) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); + tmp = (*(in + 2) - base) >> (23 - 5); tmp |= (*(in + 3) - base) << 5; tmp |= (*(in + 4) - 
base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); + tmp = (*(in + 4) - base) >> (23 - 19); tmp |= (*(in + 5) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); + tmp = (*(in + 5) - base) >> (23 - 10); tmp |= (*(in + 6) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (23 - 1); + tmp = (*(in + 6) - base) >> (23 - 1); tmp |= (*(in + 7) - base) << 1; tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (23 - 15); + tmp = (*(in + 8) - base) >> (23 - 15); tmp |= (*(in + 9) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (23 - 6); + tmp = (*(in + 9) - base) >> (23 - 6); tmp |= (*(in + 10) - base) << 6; tmp |= (*(in + 11) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (23 - 20); + tmp = (*(in + 11) - base) >> (23 - 20); tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) >> (23 - 11); + tmp = (*(in + 12) - base) >> (23 - 11); tmp |= (*(in + 13) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (23 - 2); + tmp = (*(in + 13) - base) >> (23 - 2); tmp |= (*(in + 14) - base) << 2; tmp |= (*(in + 15) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 15) - base) >> (23 - 16); + tmp = (*(in + 15) - base) >> (23 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 46) */ @@ -8293,11 
+8176,11 @@ pack23_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 46; } -static uint32_t -unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8388607); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8308,7 +8191,7 @@ unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 5)) << (23 - 5); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); + *(out + 3) = base + ((*in32 >> 5) & 8388607); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 16) */ @@ -8324,7 +8207,7 @@ unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 1)) << (23 - 1); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 8388607); + *(out + 7) = base + ((*in32 >> 1) & 8388607); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 28) */ @@ -8335,7 +8218,7 @@ unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 6)) << (23 - 6); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 8388607); + *(out + 10) = base + ((*in32 >> 6) & 8388607); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 36) */ @@ -8351,7 +8234,7 @@ unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 2)) << (23 - 2); *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 8388607); + *(out + 14) = base + ((*in32 >> 2) & 8388607); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 48) */ @@ -8361,65 +8244,64 @@ unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 46; } -static 
uint32_t -pack24_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack24_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); + tmp = (*(in + 1) - base) >> (24 - 16); tmp |= (*(in + 2) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); + tmp = (*(in + 2) - base) >> (24 - 8); tmp |= (*(in + 3) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); + tmp = (*(in + 5) - base) >> (24 - 16); tmp |= (*(in + 6) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); + tmp = (*(in + 6) - base) >> (24 - 8); tmp |= (*(in + 7) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (24 - 16); + tmp = (*(in + 9) - base) >> (24 - 16); tmp |= (*(in + 10) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (24 - 8); + tmp = (*(in + 10) - base) >> (24 - 8); tmp |= (*(in + 11) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) << 0; + tmp = (*(in + 12) - base) << 0; tmp |= (*(in + 
13) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (24 - 16); + tmp = (*(in + 13) - base) >> (24 - 16); tmp |= (*(in + 14) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (24 - 8); + tmp = (*(in + 14) - base) >> (24 - 8); tmp |= (*(in + 15) - base) << 8; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -8428,11 +8310,11 @@ pack24_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 48; } -static uint32_t -unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8443,10 +8325,10 @@ unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); + *(out + 3) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 16777215); + *(out + 4) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ @@ -8457,10 +8339,10 @@ unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); + *(out + 7) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 28) */ - *(out + 8) = base + ((*in32 >> 0) & 16777215); + *(out + 8) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 32) */ @@ -8471,10 +8353,10 
@@ unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 8) & 16777215); + *(out + 11) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 40) */ - *(out + 12) = base + ((*in32 >> 0) & 16777215); + *(out + 12) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 44) */ @@ -8485,78 +8367,77 @@ unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 48) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 8) & 16777215); + *(out + 15) = base + ((*in32 >> 8) & 16777215); /* remaining: 0 bits */ return 48; } -static uint32_t -pack25_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack25_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); + tmp = (*(in + 1) - base) >> (25 - 18); tmp |= (*(in + 2) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (25 - 11); + tmp = (*(in + 2) - base) >> (25 - 11); tmp |= (*(in + 3) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); + tmp = (*(in + 3) - base) >> (25 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); + tmp = (*(in + 5) - base) >> (25 - 22); tmp |= (*(in + 6) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp 
= (*(in + 6) - base) >> (25 - 15); + tmp = (*(in + 6) - base) >> (25 - 15); tmp |= (*(in + 7) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (25 - 8); + tmp = (*(in + 7) - base) >> (25 - 8); tmp |= (*(in + 8) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (25 - 1); + tmp = (*(in + 8) - base) >> (25 - 1); tmp |= (*(in + 9) - base) << 1; tmp |= (*(in + 10) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (25 - 19); + tmp = (*(in + 10) - base) >> (25 - 19); tmp |= (*(in + 11) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (25 - 12); + tmp = (*(in + 11) - base) >> (25 - 12); tmp |= (*(in + 12) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (25 - 5); + tmp = (*(in + 12) - base) >> (25 - 5); tmp |= (*(in + 13) - base) << 5; tmp |= (*(in + 14) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (25 - 23); + tmp = (*(in + 14) - base) >> (25 - 23); tmp |= (*(in + 15) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 15) - base) >> (25 - 16); + tmp = (*(in + 15) - base) >> (25 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 50) */ @@ -8564,11 +8445,11 @@ pack25_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 50; } -static uint32_t -unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) 
& 33554431); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 33554431); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8584,7 +8465,7 @@ unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (25 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); + *(out + 4) = base + ((*in32 >> 4) & 33554431); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 20) */ @@ -8605,7 +8486,7 @@ unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 32) */ tmp |= (*in32 % (1U << 1)) << (25 - 1); *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 1) & 33554431); + *(out + 9) = base + ((*in32 >> 1) & 33554431); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 36) */ @@ -8621,7 +8502,7 @@ unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 5)) << (25 - 5); *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 5) & 33554431); + *(out + 13) = base + ((*in32 >> 5) & 33554431); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 48) */ @@ -8636,72 +8517,71 @@ unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 50; } -static uint32_t -pack26_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack26_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); + tmp = (*(in + 1) - base) >> (26 - 20); tmp |= (*(in + 2) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); + tmp = (*(in + 2) - base) >> (26 - 14); tmp |= (*(in + 3) - base) << 14; *(uint32_t *)out = 
tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); + tmp = (*(in + 3) - base) >> (26 - 8); tmp |= (*(in + 4) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); + tmp = (*(in + 4) - base) >> (26 - 2); tmp |= (*(in + 5) - base) << 2; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 22); + tmp = (*(in + 6) - base) >> (26 - 22); tmp |= (*(in + 7) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); + tmp = (*(in + 7) - base) >> (26 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (26 - 10); + tmp = (*(in + 8) - base) >> (26 - 10); tmp |= (*(in + 9) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (26 - 4); + tmp = (*(in + 9) - base) >> (26 - 4); tmp |= (*(in + 10) - base) << 4; tmp |= (*(in + 11) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (26 - 24); + tmp = (*(in + 11) - base) >> (26 - 24); tmp |= (*(in + 12) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (26 - 18); + tmp = (*(in + 12) - base) >> (26 - 18); tmp |= (*(in + 13) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (26 - 12); + tmp = (*(in + 13) - base) >> (26 - 12); tmp |= (*(in + 14) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (26 - 6); + tmp = (*(in + 14) - 
base) >> (26 - 6); tmp |= (*(in + 15) - base) << 6; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -8710,11 +8590,11 @@ pack26_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 52; } -static uint32_t -unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 67108863); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8735,7 +8615,7 @@ unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (26 - 2); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 2) & 67108863); + *(out + 5) = base + ((*in32 >> 2) & 67108863); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ @@ -8756,7 +8636,7 @@ unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 36) */ tmp |= (*in32 % (1U << 4)) << (26 - 4); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 67108863); + *(out + 10) = base + ((*in32 >> 4) & 67108863); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 40) */ @@ -8777,82 +8657,81 @@ unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 52) */ tmp |= (*in32 % (1U << 6)) << (26 - 6); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 6) & 67108863); + *(out + 15) = base + ((*in32 >> 6) & 67108863); /* remaining: 0 bits */ return 52; } -static uint32_t -pack27_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack27_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes 
(total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); + tmp = (*(in + 1) - base) >> (27 - 22); tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); + tmp = (*(in + 2) - base) >> (27 - 17); tmp |= (*(in + 3) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); + tmp = (*(in + 3) - base) >> (27 - 12); tmp |= (*(in + 4) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); + tmp = (*(in + 4) - base) >> (27 - 7); tmp |= (*(in + 5) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); + tmp = (*(in + 5) - base) >> (27 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); + tmp = (*(in + 7) - base) >> (27 - 24); tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (27 - 19); + tmp = (*(in + 8) - base) >> (27 - 19); tmp |= (*(in + 9) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (27 - 14); + tmp = (*(in + 9) - base) >> (27 - 14); tmp |= (*(in + 10) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (27 - 9); + tmp = (*(in + 10) - base) >> (27 - 9); tmp |= (*(in + 11) - base) << 9; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (27 - 4); + tmp = (*(in + 11) - base) >> (27 - 4); tmp |= (*(in + 12) - base) << 4; tmp |= (*(in + 13) - base) << 31; 
*(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (27 - 26); + tmp = (*(in + 13) - base) >> (27 - 26); tmp |= (*(in + 14) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (27 - 21); + tmp = (*(in + 14) - base) >> (27 - 21); tmp |= (*(in + 15) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 15) - base) >> (27 - 16); + tmp = (*(in + 15) - base) >> (27 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 54) */ @@ -8860,11 +8739,11 @@ pack27_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 54; } -static uint32_t -unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 134217727); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ @@ -8890,7 +8769,7 @@ unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (27 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); + *(out + 6) = base + ((*in32 >> 2) & 134217727); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ @@ -8916,7 +8795,7 @@ unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 44) */ tmp |= (*in32 % (1U << 4)) << (27 - 4); *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 134217727); + *(out + 12) = base + ((*in32 >> 4) & 134217727); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 48) */ @@ -8936,75 +8815,74 @@ unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 54; } -static 
uint32_t -pack28_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack28_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); + tmp = (*(in + 1) - base) >> (28 - 24); tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (28 - 20); + tmp = (*(in + 2) - base) >> (28 - 20); tmp |= (*(in + 3) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); + tmp = (*(in + 3) - base) >> (28 - 16); tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); + tmp = (*(in + 4) - base) >> (28 - 12); tmp |= (*(in + 5) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); + tmp = (*(in + 5) - base) >> (28 - 8); tmp |= (*(in + 6) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); + tmp = (*(in + 6) - base) >> (28 - 4); tmp |= (*(in + 7) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) << 0; + tmp = (*(in + 8) - base) << 0; tmp |= (*(in + 9) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (28 - 24); + tmp = (*(in + 9) - base) >> (28 - 24); tmp |= (*(in + 10) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (28 - 20); + tmp = (*(in + 10) - 
base) >> (28 - 20); tmp |= (*(in + 11) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (28 - 16); + tmp = (*(in + 11) - base) >> (28 - 16); tmp |= (*(in + 12) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (28 - 12); + tmp = (*(in + 12) - base) >> (28 - 12); tmp |= (*(in + 13) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (28 - 8); + tmp = (*(in + 13) - base) >> (28 - 8); tmp |= (*(in + 14) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (28 - 4); + tmp = (*(in + 14) - base) >> (28 - 4); tmp |= (*(in + 15) - base) << 4; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -9013,11 +8891,11 @@ pack28_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 56; } -static uint32_t -unpack28_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack28_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ @@ -9048,10 +8926,10 @@ unpack28_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); + *(out + 7) = base + ((*in32 >> 4) & 268435455); in32++; /* consumed: 4 bytes (total: 32) */ - *(out + 8) = base + ((*in32 >> 0) & 268435455); + *(out + 8) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 36) */ @@ -9082,86 +8960,85 @@ unpack28_16(uint32_t base, const uint8_t *in, 
uint32_t *out) { /* consumed: 4 bytes (total: 56) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 4) & 268435455); + *(out + 15) = base + ((*in32 >> 4) & 268435455); /* remaining: 0 bits */ return 56; } -static uint32_t -pack29_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack29_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); + tmp = (*(in + 1) - base) >> (29 - 26); tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); + tmp = (*(in + 2) - base) >> (29 - 23); tmp |= (*(in + 3) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); + tmp = (*(in + 3) - base) >> (29 - 20); tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); + tmp = (*(in + 4) - base) >> (29 - 17); tmp |= (*(in + 5) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); + tmp = (*(in + 5) - base) >> (29 - 14); tmp |= (*(in + 6) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); + tmp = (*(in + 6) - base) >> (29 - 11); tmp |= (*(in + 7) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); + tmp = (*(in + 7) - base) >> (29 - 8); tmp |= (*(in + 8) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 
bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (29 - 5); + tmp = (*(in + 8) - base) >> (29 - 5); tmp |= (*(in + 9) - base) << 5; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (29 - 2); + tmp = (*(in + 9) - base) >> (29 - 2); tmp |= (*(in + 10) - base) << 2; tmp |= (*(in + 11) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (29 - 28); + tmp = (*(in + 11) - base) >> (29 - 28); tmp |= (*(in + 12) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (29 - 25); + tmp = (*(in + 12) - base) >> (29 - 25); tmp |= (*(in + 13) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (29 - 22); + tmp = (*(in + 13) - base) >> (29 - 22); tmp |= (*(in + 14) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (29 - 19); + tmp = (*(in + 14) - base) >> (29 - 19); tmp |= (*(in + 15) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 15) - base) >> (29 - 16); + tmp = (*(in + 15) - base) >> (29 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 58) */ @@ -9169,11 +9046,11 @@ pack29_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 58; } -static uint32_t -unpack29_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack29_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 536870911); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 8) */ @@ -9219,7 +9096,7 @@ unpack29_16(uint32_t 
base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 40) */ tmp |= (*in32 % (1U << 2)) << (29 - 2); *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 536870911); + *(out + 10) = base + ((*in32 >> 2) & 536870911); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 44) */ @@ -9249,80 +9126,79 @@ unpack29_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 58; } -static uint32_t -pack30_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack30_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (30 - 28); + tmp = (*(in + 1) - base) >> (30 - 28); tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (30 - 26); + tmp = (*(in + 2) - base) >> (30 - 26); tmp |= (*(in + 3) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); + tmp = (*(in + 3) - base) >> (30 - 24); tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); + tmp = (*(in + 4) - base) >> (30 - 22); tmp |= (*(in + 5) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); + tmp = (*(in + 5) - base) >> (30 - 20); tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); + tmp = (*(in + 6) - base) >> (30 - 18); tmp |= (*(in + 7) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) 
>> (30 - 16); + tmp = (*(in + 7) - base) >> (30 - 16); tmp |= (*(in + 8) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (30 - 14); + tmp = (*(in + 8) - base) >> (30 - 14); tmp |= (*(in + 9) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (30 - 12); + tmp = (*(in + 9) - base) >> (30 - 12); tmp |= (*(in + 10) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (30 - 10); + tmp = (*(in + 10) - base) >> (30 - 10); tmp |= (*(in + 11) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (30 - 8); + tmp = (*(in + 11) - base) >> (30 - 8); tmp |= (*(in + 12) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (30 - 6); + tmp = (*(in + 12) - base) >> (30 - 6); tmp |= (*(in + 13) - base) << 6; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (30 - 4); + tmp = (*(in + 13) - base) >> (30 - 4); tmp |= (*(in + 14) - base) << 4; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (30 - 2); + tmp = (*(in + 14) - base) >> (30 - 2); tmp |= (*(in + 15) - base) << 2; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -9331,11 +9207,11 @@ pack30_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 60; } -static uint32_t -unpack30_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack30_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 
1073741823); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ @@ -9406,90 +9282,89 @@ unpack30_16(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 60) */ tmp |= (*in32 % (1U << 2)) << (30 - 2); *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 2) & 1073741823); + *(out + 15) = base + ((*in32 >> 2) & 1073741823); /* remaining: 0 bits */ return 60; } -static uint32_t -pack31_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack31_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); + tmp = (*(in + 1) - base) >> (31 - 30); tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); + tmp = (*(in + 2) - base) >> (31 - 29); tmp |= (*(in + 3) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); + tmp = (*(in + 3) - base) >> (31 - 28); tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); + tmp = (*(in + 4) - base) >> (31 - 27); tmp |= (*(in + 5) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); + tmp = (*(in + 5) - base) >> (31 - 26); tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); + tmp = (*(in + 6) - base) >> (31 - 25); tmp |= (*(in + 7) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> 
(31 - 24); + tmp = (*(in + 7) - base) >> (31 - 24); tmp |= (*(in + 8) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (31 - 23); + tmp = (*(in + 8) - base) >> (31 - 23); tmp |= (*(in + 9) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (31 - 22); + tmp = (*(in + 9) - base) >> (31 - 22); tmp |= (*(in + 10) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (31 - 21); + tmp = (*(in + 10) - base) >> (31 - 21); tmp |= (*(in + 11) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (31 - 20); + tmp = (*(in + 11) - base) >> (31 - 20); tmp |= (*(in + 12) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (31 - 19); + tmp = (*(in + 12) - base) >> (31 - 19); tmp |= (*(in + 13) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (31 - 18); + tmp = (*(in + 13) - base) >> (31 - 18); tmp |= (*(in + 14) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (31 - 17); + tmp = (*(in + 14) - base) >> (31 - 17); tmp |= (*(in + 15) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 15) - base) >> (31 - 16); + tmp = (*(in + 15) - base) >> (31 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 62) */ @@ -9497,11 +9372,11 @@ pack31_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 62; } -static uint32_t -unpack31_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack31_16(uint32_t base, const 
uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2147483647); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 8) */ @@ -9581,8 +9456,7 @@ unpack31_16(uint32_t base, const uint8_t *in, uint32_t *out) { return 62; } -static uint32_t -pack32_16(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack32_16(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t i; uint32_t *out32 = (uint32_t *)out; for (i = 0; i < 16; i++) @@ -9590,8 +9464,7 @@ pack32_16(uint32_t base, const uint32_t *in, uint8_t *out) { return 16 * sizeof(uint32_t); } -static uint32_t -unpack32_16(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack32_16(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t i; uint32_t *in32 = (uint32_t *)in; for (i = 0; i < 16; i++) @@ -9600,81 +9473,24 @@ unpack32_16(uint32_t base, const uint8_t *in, uint32_t *out) { } for_packfunc_t for_pack16[33] = { - pack0_n, - pack1_16, - pack2_16, - pack3_16, - pack4_16, - pack5_16, - pack6_16, - pack7_16, - pack8_16, - pack9_16, - pack10_16, - pack11_16, - pack12_16, - pack13_16, - pack14_16, - pack15_16, - pack16_16, - pack17_16, - pack18_16, - pack19_16, - pack20_16, - pack21_16, - pack22_16, - pack23_16, - pack24_16, - pack25_16, - pack26_16, - pack27_16, - pack28_16, - pack29_16, - pack30_16, - pack31_16, - pack32_16 -}; + pack0_n, pack1_16, pack2_16, pack3_16, pack4_16, pack5_16, pack6_16, + pack7_16, pack8_16, pack9_16, pack10_16, pack11_16, pack12_16, pack13_16, + pack14_16, pack15_16, pack16_16, pack17_16, pack18_16, pack19_16, pack20_16, + pack21_16, pack22_16, pack23_16, pack24_16, pack25_16, pack26_16, pack27_16, + pack28_16, pack29_16, pack30_16, pack31_16, pack32_16}; for_unpackfunc_t for_unpack16[33] = { - unpack0_16, - unpack1_16, - unpack2_16, - unpack3_16, - unpack4_16, - unpack5_16, - 
unpack6_16, - unpack7_16, - unpack8_16, - unpack9_16, - unpack10_16, - unpack11_16, - unpack12_16, - unpack13_16, - unpack14_16, - unpack15_16, - unpack16_16, - unpack17_16, - unpack18_16, - unpack19_16, - unpack20_16, - unpack21_16, - unpack22_16, - unpack23_16, - unpack24_16, - unpack25_16, - unpack26_16, - unpack27_16, - unpack28_16, - unpack29_16, - unpack30_16, - unpack31_16, - unpack32_16 -}; - -static uint32_t -pack1_8(uint32_t base, const uint32_t *in, uint8_t *out) { + unpack0_16, unpack1_16, unpack2_16, unpack3_16, unpack4_16, + unpack5_16, unpack6_16, unpack7_16, unpack8_16, unpack9_16, + unpack10_16, unpack11_16, unpack12_16, unpack13_16, unpack14_16, + unpack15_16, unpack16_16, unpack17_16, unpack18_16, unpack19_16, + unpack20_16, unpack21_16, unpack22_16, unpack23_16, unpack24_16, + unpack25_16, unpack26_16, unpack27_16, unpack28_16, unpack29_16, + unpack30_16, unpack31_16, unpack32_16}; + +static uint32_t pack1_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 1; tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 3; @@ -9689,26 +9505,25 @@ pack1_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 1; } -static uint32_t -unpack1_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack1_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1); + *(out + 1) = base + ((*in32 >> 1) & 1); + *(out + 2) = base + ((*in32 >> 2) & 1); + *(out + 3) = base 
+ ((*in32 >> 3) & 1); + *(out + 4) = base + ((*in32 >> 4) & 1); + *(out + 5) = base + ((*in32 >> 5) & 1); + *(out + 6) = base + ((*in32 >> 6) & 1); + *(out + 7) = base + ((*in32 >> 7) & 1); /* remaining: 24 bits */ return 1; } -static uint32_t -pack2_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack2_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 2; tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 6; @@ -9723,26 +9538,25 @@ pack2_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 2; } -static uint32_t -unpack2_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack2_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 3); + *(out + 1) = base + ((*in32 >> 2) & 3); + *(out + 2) = base + ((*in32 >> 4) & 3); + *(out + 3) = base + ((*in32 >> 6) & 3); + *(out + 4) = base + ((*in32 >> 8) & 3); + *(out + 5) = base + ((*in32 >> 10) & 3); + *(out + 6) = base + ((*in32 >> 12) & 3); + *(out + 7) = base + ((*in32 >> 14) & 3); /* remaining: 16 bits */ return 2; } -static uint32_t -pack3_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack3_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 3; tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 9; @@ -9757,26 +9571,25 @@ pack3_8(uint32_t 
base, const uint32_t *in, uint8_t *out) { return 3; } -static uint32_t -unpack3_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack3_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 7); + *(out + 1) = base + ((*in32 >> 3) & 7); + *(out + 2) = base + ((*in32 >> 6) & 7); + *(out + 3) = base + ((*in32 >> 9) & 7); + *(out + 4) = base + ((*in32 >> 12) & 7); + *(out + 5) = base + ((*in32 >> 15) & 7); + *(out + 6) = base + ((*in32 >> 18) & 7); + *(out + 7) = base + ((*in32 >> 21) & 7); /* remaining: 8 bits */ return 3; } -static uint32_t -pack4_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack4_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 4; tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 12; @@ -9791,26 +9604,25 @@ pack4_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 4; } -static uint32_t -unpack4_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack4_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + 
((*in32 >> 28) & 15); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 15); + *(out + 1) = base + ((*in32 >> 4) & 15); + *(out + 2) = base + ((*in32 >> 8) & 15); + *(out + 3) = base + ((*in32 >> 12) & 15); + *(out + 4) = base + ((*in32 >> 16) & 15); + *(out + 5) = base + ((*in32 >> 20) & 15); + *(out + 6) = base + ((*in32 >> 24) & 15); + *(out + 7) = base + ((*in32 >> 28) & 15); /* remaining: 0 bits */ return 4; } -static uint32_t -pack5_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack5_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 5; tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 15; @@ -9820,7 +9632,7 @@ pack5_8(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); + tmp = (*(in + 6) - base) >> (5 - 3); tmp |= (*(in + 7) - base) << 3; /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; @@ -9829,30 +9641,29 @@ pack5_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 5; } -static uint32_t -unpack5_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack5_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + ((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 31); + *(out + 1) = base + ((*in32 >> 5) & 31); + *(out + 2) = base + ((*in32 >> 10) & 31); + *(out + 3) = base + ((*in32 >> 15) & 31); + *(out + 4) = base + ((*in32 >> 20) & 31); + *(out + 5) = base + ((*in32 >> 25) & 31); tmp = (*in32 >> 30); 
in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) << (5 - 3); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); + *(out + 7) = base + ((*in32 >> 3) & 31); /* remaining: 24 bits */ return 5; } -static uint32_t -pack6_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack6_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 6; tmp |= (*(in + 2) - base) << 12; tmp |= (*(in + 3) - base) << 18; @@ -9861,7 +9672,7 @@ pack6_8(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); + tmp = (*(in + 5) - base) >> (6 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 10; /* remaining: 16 bits */ @@ -9871,30 +9682,29 @@ pack6_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 6; } -static uint32_t -unpack6_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack6_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); - *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 63); + *(out + 1) = base + ((*in32 >> 6) & 63); + *(out + 2) = base + ((*in32 >> 12) & 63); + *(out + 3) = base + ((*in32 >> 18) & 63); + *(out + 4) = base + ((*in32 >> 24) & 63); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (6 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + ((*in32 >> 10) & 63); + *(out + 6) = base + ((*in32 >> 4) & 63); + *(out + 7) = base + ((*in32 >> 10) & 63); /* remaining: 16 
bits */ return 6; } -static uint32_t -pack7_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack7_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 7; tmp |= (*(in + 2) - base) << 14; tmp |= (*(in + 3) - base) << 21; @@ -9902,7 +9712,7 @@ pack7_8(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); + tmp = (*(in + 4) - base) >> (7 - 3); tmp |= (*(in + 5) - base) << 3; tmp |= (*(in + 6) - base) << 10; tmp |= (*(in + 7) - base) << 17; @@ -9913,37 +9723,36 @@ pack7_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 7; } -static uint32_t -unpack7_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack7_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 127); + *(out + 1) = base + ((*in32 >> 7) & 127); + *(out + 2) = base + ((*in32 >> 14) & 127); + *(out + 3) = base + ((*in32 >> 21) & 127); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 3)) << (7 - 3); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); + *(out + 5) = base + ((*in32 >> 3) & 127); + *(out + 6) = base + ((*in32 >> 10) & 127); + *(out + 7) = base + ((*in32 >> 17) & 127); /* remaining: 8 bits */ return 7; } -static uint32_t -pack8_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack8_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, 
length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 8; tmp |= (*(in + 2) - base) << 16; tmp |= (*(in + 3) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 8; tmp |= (*(in + 6) - base) << 16; tmp |= (*(in + 7) - base) << 24; @@ -9954,35 +9763,34 @@ pack8_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 8; } -static uint32_t -unpack8_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack8_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - *(out + 3) = base + ((*in32 >> 24) & 255); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 255); + *(out + 1) = base + ((*in32 >> 8) & 255); + *(out + 2) = base + ((*in32 >> 16) & 255); + *(out + 3) = base + ((*in32 >> 24) & 255); in32++; /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); + *(out + 4) = base + ((*in32 >> 0) & 255); + *(out + 5) = base + ((*in32 >> 8) & 255); + *(out + 6) = base + ((*in32 >> 16) & 255); + *(out + 7) = base + ((*in32 >> 24) & 255); /* remaining: 0 bits */ return 8; } -static uint32_t -pack9_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack9_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 9; tmp |= (*(in + 2) - base) << 18; tmp |= (*(in + 3) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - 
base) >> (9 - 4); + tmp = (*(in + 3) - base) >> (9 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 13; tmp |= (*(in + 6) - base) << 22; @@ -9990,7 +9798,7 @@ pack9_8(uint32_t base, const uint32_t *in, uint8_t *out) { *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); + tmp = (*(in + 7) - base) >> (9 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 9) */ @@ -9998,21 +9806,21 @@ pack9_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 9; } -static uint32_t -unpack9_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack9_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + ((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 511); + *(out + 1) = base + ((*in32 >> 9) & 511); + *(out + 2) = base + ((*in32 >> 18) & 511); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (9 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); + *(out + 4) = base + ((*in32 >> 4) & 511); + *(out + 5) = base + ((*in32 >> 13) & 511); + *(out + 6) = base + ((*in32 >> 22) & 511); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ @@ -10022,24 +9830,23 @@ unpack9_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 9; } -static uint32_t -pack10_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack10_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 10; tmp |= (*(in + 2) - base) << 20; tmp |= (*(in + 3) - base) << 30; 
*(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); + tmp = (*(in + 3) - base) >> (10 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 18; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); + tmp = (*(in + 6) - base) >> (10 - 6); tmp |= (*(in + 7) - base) << 6; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -10048,47 +9855,46 @@ pack10_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 10; } -static uint32_t -unpack10_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack10_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1023); + *(out + 1) = base + ((*in32 >> 10) & 1023); + *(out + 2) = base + ((*in32 >> 20) & 1023); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (10 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 18) & 1023); + *(out + 4) = base + ((*in32 >> 8) & 1023); + *(out + 5) = base + ((*in32 >> 18) & 1023); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (10 - 6); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); + *(out + 7) = base + ((*in32 >> 6) & 1023); /* remaining: 16 bits */ return 10; } -static uint32_t -pack11_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack11_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 11; tmp |= (*(in + 
2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (11 - 1); + tmp = (*(in + 2) - base) >> (11 - 1); tmp |= (*(in + 3) - base) << 1; tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); + tmp = (*(in + 5) - base) >> (11 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 13; /* remaining: 8 bits */ @@ -10098,47 +9904,46 @@ pack11_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 11; } -static uint32_t -unpack11_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack11_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2047); + *(out + 1) = base + ((*in32 >> 11) & 2047); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 1)) << (11 - 1); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); + *(out + 3) = base + ((*in32 >> 1) & 2047); + *(out + 4) = base + ((*in32 >> 12) & 2047); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (11 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); + *(out + 6) = base + ((*in32 >> 2) & 2047); + *(out + 7) = base + ((*in32 >> 13) & 2047); /* remaining: 8 bits */ return 11; } -static uint32_t -pack12_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack12_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 12; 
tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); + tmp = (*(in + 2) - base) >> (12 - 4); tmp |= (*(in + 3) - base) << 4; tmp |= (*(in + 4) - base) << 16; tmp |= (*(in + 5) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); + tmp = (*(in + 5) - base) >> (12 - 8); tmp |= (*(in + 6) - base) << 8; tmp |= (*(in + 7) - base) << 20; /* remaining: 0 bits */ @@ -10148,53 +9953,52 @@ pack12_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 12; } -static uint32_t -unpack12_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack12_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4095); + *(out + 1) = base + ((*in32 >> 12) & 4095); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (12 - 4); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); + *(out + 3) = base + ((*in32 >> 4) & 4095); + *(out + 4) = base + ((*in32 >> 16) & 4095); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (12 - 8); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); + *(out + 6) = base + ((*in32 >> 8) & 4095); + *(out + 7) = base + ((*in32 >> 20) & 4095); /* remaining: 0 bits */ return 12; } -static uint32_t -pack13_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack13_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - 
base) << 13; tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); + tmp = (*(in + 2) - base) >> (13 - 7); tmp |= (*(in + 3) - base) << 7; tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); + tmp = (*(in + 4) - base) >> (13 - 1); tmp |= (*(in + 5) - base) << 1; tmp |= (*(in + 6) - base) << 14; tmp |= (*(in + 7) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); + tmp = (*(in + 7) - base) >> (13 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 13) */ @@ -10202,25 +10006,25 @@ pack13_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 13; } -static uint32_t -unpack13_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack13_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8191); + *(out + 1) = base + ((*in32 >> 13) & 8191); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 7)) << (13 - 7); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); + *(out + 3) = base + ((*in32 >> 7) & 8191); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 1)) << (13 - 1); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); + *(out + 5) = base + ((*in32 >> 1) & 8191); + *(out + 6) = base + ((*in32 >> 14) & 8191); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 16) */ @@ -10230,28 +10034,27 @@ unpack13_8(uint32_t base, const uint8_t *in, 
uint32_t *out) { return 13; } -static uint32_t -pack14_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack14_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 14; tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); + tmp = (*(in + 2) - base) >> (14 - 10); tmp |= (*(in + 3) - base) << 10; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); + tmp = (*(in + 4) - base) >> (14 - 6); tmp |= (*(in + 5) - base) << 6; tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); + tmp = (*(in + 6) - base) >> (14 - 2); tmp |= (*(in + 7) - base) << 2; /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; @@ -10260,56 +10063,55 @@ pack14_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 14; } -static uint32_t -unpack14_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack14_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16383); + *(out + 1) = base + ((*in32 >> 14) & 16383); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (14 - 10); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); + *(out + 3) = base + ((*in32 >> 10) & 16383); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 6)) << (14 - 6); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 16383); + *(out 
+ 5) = base + ((*in32 >> 6) & 16383); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 2)) << (14 - 2); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); + *(out + 7) = base + ((*in32 >> 2) & 16383); /* remaining: 16 bits */ return 14; } -static uint32_t -pack15_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack15_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 15; tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); + tmp = (*(in + 2) - base) >> (15 - 13); tmp |= (*(in + 3) - base) << 13; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); + tmp = (*(in + 4) - base) >> (15 - 11); tmp |= (*(in + 5) - base) << 11; tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); + tmp = (*(in + 6) - base) >> (15 - 9); tmp |= (*(in + 7) - base) << 9; /* remaining: 8 bits */ length = (32 / 8) - (32 - 24) / 8; @@ -10318,53 +10120,52 @@ pack15_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 15; } -static uint32_t -unpack15_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack15_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 32767); + *(out + 1) = base + ((*in32 >> 15) & 32767); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 13)) << (15 - 13); *(out + 2) = base + tmp; - *(out 
+ 3) = base + ((*in32 >> 13) & 32767); + *(out + 3) = base + ((*in32 >> 13) & 32767); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 11)) << (15 - 11); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); + *(out + 5) = base + ((*in32 >> 11) & 32767); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (15 - 9); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); + *(out + 7) = base + ((*in32 >> 9) & 32767); /* remaining: 8 bits */ return 15; } -static uint32_t -pack16_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack16_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; + tmp = (*(in + 2) - base) << 0; tmp |= (*(in + 3) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; + tmp = (*(in + 6) - base) << 0; tmp |= (*(in + 7) - base) << 16; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -10373,55 +10174,54 @@ pack16_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 16; } -static uint32_t -unpack16_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack16_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 65535); + *(out + 1) = base + ((*in32 >> 16) & 65535); in32++; /* 
consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); + *(out + 2) = base + ((*in32 >> 0) & 65535); + *(out + 3) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); + *(out + 4) = base + ((*in32 >> 0) & 65535); + *(out + 5) = base + ((*in32 >> 16) & 65535); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); + *(out + 6) = base + ((*in32 >> 0) & 65535); + *(out + 7) = base + ((*in32 >> 16) & 65535); /* remaining: 0 bits */ return 16; } -static uint32_t -pack17_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack17_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); + tmp = (*(in + 1) - base) >> (17 - 2); tmp |= (*(in + 2) - base) << 2; tmp |= (*(in + 3) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); + tmp = (*(in + 3) - base) >> (17 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); + tmp = (*(in + 5) - base) >> (17 - 6); tmp |= (*(in + 6) - base) << 6; tmp |= (*(in + 7) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); + tmp = (*(in + 7) - base) >> (17 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 17) */ @@ -10429,29 +10229,29 @@ pack17_8(uint32_t base, const 
uint32_t *in, uint8_t *out) { return 17; } -static uint32_t -unpack17_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack17_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 131071); tmp = (*in32 >> 17); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 2)) << (17 - 2); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); + *(out + 2) = base + ((*in32 >> 2) & 131071); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 4)) << (17 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); + *(out + 4) = base + ((*in32 >> 4) & 131071); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 6)) << (17 - 6); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); + *(out + 6) = base + ((*in32 >> 6) & 131071); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 20) */ @@ -10461,33 +10261,32 @@ unpack17_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 17; } -static uint32_t -pack18_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack18_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); + tmp = (*(in + 1) - base) >> (18 - 4); tmp |= (*(in + 2) - base) << 4; tmp |= (*(in + 3) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); + tmp = (*(in + 3) - base) >> (18 - 8); tmp |= (*(in + 4) - base) << 8; tmp |= (*(in + 5) - base) << 26; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); + tmp = (*(in + 5) - base) >> (18 - 12); tmp |= (*(in + 6) - base) << 12; tmp |= (*(in + 7) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (18 - 16); + tmp = (*(in + 7) - base) >> (18 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 18) */ @@ -10495,29 +10294,29 @@ pack18_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 18; } -static uint32_t -unpack18_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack18_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 262143); tmp = (*in32 >> 18); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 4)) << (18 - 4); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); + *(out + 2) = base + ((*in32 >> 4) & 262143); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (18 - 8); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); + *(out + 4) = base + ((*in32 >> 8) & 262143); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 12)) << (18 - 12); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); + *(out + 6) = base + ((*in32 >> 12) & 262143); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ @@ -10527,32 +10326,31 @@ unpack18_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 18; } -static uint32_t -pack19_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack19_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= 
(*(in + 1) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); + tmp = (*(in + 1) - base) >> (19 - 6); tmp |= (*(in + 2) - base) << 6; tmp |= (*(in + 3) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); + tmp = (*(in + 3) - base) >> (19 - 12); tmp |= (*(in + 4) - base) << 12; tmp |= (*(in + 5) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); + tmp = (*(in + 5) - base) >> (19 - 18); tmp |= (*(in + 6) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); + tmp = (*(in + 6) - base) >> (19 - 5); tmp |= (*(in + 7) - base) << 5; /* remaining: 8 bits */ length = (32 / 8) - (32 - 24) / 8; @@ -10561,23 +10359,23 @@ pack19_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 19; } -static uint32_t -unpack19_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack19_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 524287); tmp = (*in32 >> 19); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 6)) << (19 - 6); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); + *(out + 2) = base + ((*in32 >> 6) & 524287); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 12)) << (19 - 12); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); + *(out + 4) = base + ((*in32 >> 12) & 524287); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 16) */ @@ -10588,37 +10386,36 @@ unpack19_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* 
consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 5)) << (19 - 5); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); + *(out + 7) = base + ((*in32 >> 5) & 524287); /* remaining: 8 bits */ return 19; } -static uint32_t -pack20_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack20_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); + tmp = (*(in + 1) - base) >> (20 - 8); tmp |= (*(in + 2) - base) << 8; tmp |= (*(in + 3) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); + tmp = (*(in + 3) - base) >> (20 - 16); tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); + tmp = (*(in + 4) - base) >> (20 - 4); tmp |= (*(in + 5) - base) << 4; tmp |= (*(in + 6) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); + tmp = (*(in + 6) - base) >> (20 - 12); tmp |= (*(in + 7) - base) << 12; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -10627,17 +10424,17 @@ pack20_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 20; } -static uint32_t -unpack20_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack20_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1048575); tmp = (*in32 >> 20); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 8)) << (20 - 8); *(out + 1) = base + tmp; - 
*(out + 2) = base + ((*in32 >> 8) & 1048575); + *(out + 2) = base + ((*in32 >> 8) & 1048575); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 12) */ @@ -10648,48 +10445,47 @@ unpack20_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (20 - 4); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); + *(out + 5) = base + ((*in32 >> 4) & 1048575); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 12)) << (20 - 12); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); + *(out + 7) = base + ((*in32 >> 12) & 1048575); /* remaining: 0 bits */ return 20; } -static uint32_t -pack21_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack21_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); + tmp = (*(in + 1) - base) >> (21 - 10); tmp |= (*(in + 2) - base) << 10; tmp |= (*(in + 3) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); + tmp = (*(in + 3) - base) >> (21 - 20); tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); + tmp = (*(in + 4) - base) >> (21 - 9); tmp |= (*(in + 5) - base) << 9; tmp |= (*(in + 6) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); + tmp = (*(in + 6) - base) >> (21 - 19); tmp |= (*(in + 7) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); + tmp = (*(in + 7) - base) 
>> (21 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 21) */ @@ -10697,17 +10493,17 @@ pack21_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 21; } -static uint32_t -unpack21_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack21_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 2097151); tmp = (*in32 >> 21); in32++; /* consumed: 4 bytes (total: 8) */ tmp |= (*in32 % (1U << 10)) << (21 - 10); *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 2097151); + *(out + 2) = base + ((*in32 >> 10) & 2097151); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 12) */ @@ -10718,7 +10514,7 @@ unpack21_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 9)) << (21 - 9); *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); + *(out + 5) = base + ((*in32 >> 9) & 2097151); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 20) */ @@ -10733,37 +10529,36 @@ unpack21_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 21; } -static uint32_t -pack22_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack22_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); + tmp = (*(in + 1) - base) >> (22 - 12); tmp |= (*(in + 2) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); + tmp = (*(in + 2) - base) >> (22 - 2); tmp |= (*(in + 3) - base) << 2; tmp |= (*(in + 4) - base) << 24; 
*(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); + tmp = (*(in + 4) - base) >> (22 - 14); tmp |= (*(in + 5) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); + tmp = (*(in + 5) - base) >> (22 - 4); tmp |= (*(in + 6) - base) << 4; tmp |= (*(in + 7) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); + tmp = (*(in + 7) - base) >> (22 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 22) */ @@ -10771,11 +10566,11 @@ pack22_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 22; } -static uint32_t -unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 4194303); tmp = (*in32 >> 22); in32++; /* consumed: 4 bytes (total: 8) */ @@ -10786,7 +10581,7 @@ unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 2)) << (22 - 2); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); + *(out + 3) = base + ((*in32 >> 2) & 4194303); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 16) */ @@ -10797,7 +10592,7 @@ unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 4)) << (22 - 4); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); + *(out + 6) = base + ((*in32 >> 4) & 4194303); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 24) */ @@ -10807,36 +10602,35 @@ unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 22; } 
-static uint32_t -pack23_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack23_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 14); + tmp = (*(in + 1) - base) >> (23 - 14); tmp |= (*(in + 2) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); + tmp = (*(in + 2) - base) >> (23 - 5); tmp |= (*(in + 3) - base) << 5; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); + tmp = (*(in + 4) - base) >> (23 - 19); tmp |= (*(in + 5) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); + tmp = (*(in + 5) - base) >> (23 - 10); tmp |= (*(in + 6) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (23 - 1); + tmp = (*(in + 6) - base) >> (23 - 1); tmp |= (*(in + 7) - base) << 1; /* remaining: 8 bits */ length = (32 / 8) - (32 - 24) / 8; @@ -10845,11 +10639,11 @@ pack23_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 23; } -static uint32_t -unpack23_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack23_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 8388607); tmp = (*in32 >> 23); in32++; /* consumed: 4 bytes (total: 8) */ @@ -10860,7 +10654,7 @@ unpack23_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= 
(*in32 % (1U << 5)) << (23 - 5); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); + *(out + 3) = base + ((*in32 >> 5) & 8388607); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 16) */ @@ -10876,40 +10670,39 @@ unpack23_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 1)) << (23 - 1); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 8388607); + *(out + 7) = base + ((*in32 >> 1) & 8388607); /* remaining: 8 bits */ return 23; } -static uint32_t -pack24_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack24_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); + tmp = (*(in + 1) - base) >> (24 - 16); tmp |= (*(in + 2) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); + tmp = (*(in + 2) - base) >> (24 - 8); tmp |= (*(in + 3) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; tmp |= (*(in + 5) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); + tmp = (*(in + 5) - base) >> (24 - 16); tmp |= (*(in + 6) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); + tmp = (*(in + 6) - base) >> (24 - 8); tmp |= (*(in + 7) - base) << 8; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -10918,11 +10711,11 @@ pack24_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 24; } -static uint32_t -unpack24_8(uint32_t base, 
const uint8_t *in, uint32_t *out) { +static uint32_t unpack24_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 8) */ @@ -10933,10 +10726,10 @@ unpack24_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 12) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); + *(out + 3) = base + ((*in32 >> 8) & 16777215); in32++; /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 16777215); + *(out + 4) = base + ((*in32 >> 0) & 16777215); tmp = (*in32 >> 24); in32++; /* consumed: 4 bytes (total: 20) */ @@ -10947,46 +10740,45 @@ unpack24_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 8)) << (24 - 8); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); + *(out + 7) = base + ((*in32 >> 8) & 16777215); /* remaining: 0 bits */ return 24; } -static uint32_t -pack25_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack25_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); + tmp = (*(in + 1) - base) >> (25 - 18); tmp |= (*(in + 2) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (25 - 11); + tmp = (*(in + 2) - base) >> (25 - 11); tmp |= (*(in + 3) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); + tmp = (*(in + 
3) - base) >> (25 - 4); tmp |= (*(in + 4) - base) << 4; tmp |= (*(in + 5) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); + tmp = (*(in + 5) - base) >> (25 - 22); tmp |= (*(in + 6) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (25 - 15); + tmp = (*(in + 6) - base) >> (25 - 15); tmp |= (*(in + 7) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (25 - 8); + tmp = (*(in + 7) - base) >> (25 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 25) */ @@ -10994,11 +10786,11 @@ pack25_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 25; } -static uint32_t -unpack25_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack25_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 33554431); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 33554431); tmp = (*in32 >> 25); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11014,7 +10806,7 @@ unpack25_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 16) */ tmp |= (*in32 % (1U << 4)) << (25 - 4); *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); + *(out + 4) = base + ((*in32 >> 4) & 33554431); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 20) */ @@ -11034,41 +10826,40 @@ unpack25_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 25; } -static uint32_t -pack26_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack26_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 26; *(uint32_t 
*)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); + tmp = (*(in + 1) - base) >> (26 - 20); tmp |= (*(in + 2) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); + tmp = (*(in + 2) - base) >> (26 - 14); tmp |= (*(in + 3) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); + tmp = (*(in + 3) - base) >> (26 - 8); tmp |= (*(in + 4) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); + tmp = (*(in + 4) - base) >> (26 - 2); tmp |= (*(in + 5) - base) << 2; tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 22); + tmp = (*(in + 6) - base) >> (26 - 22); tmp |= (*(in + 7) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); + tmp = (*(in + 7) - base) >> (26 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 26) */ @@ -11076,11 +10867,11 @@ pack26_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 26; } -static uint32_t -unpack26_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack26_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 67108863); tmp = (*in32 >> 26); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11101,7 +10892,7 @@ unpack26_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 20) */ tmp |= (*in32 % (1U << 2)) << (26 - 2); *(out + 4) = base + tmp; - *(out + 5) = base + 
((*in32 >> 2) & 67108863); + *(out + 5) = base + ((*in32 >> 2) & 67108863); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 24) */ @@ -11116,41 +10907,40 @@ unpack26_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 26; } -static uint32_t -pack27_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack27_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); + tmp = (*(in + 1) - base) >> (27 - 22); tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); + tmp = (*(in + 2) - base) >> (27 - 17); tmp |= (*(in + 3) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); + tmp = (*(in + 3) - base) >> (27 - 12); tmp |= (*(in + 4) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); + tmp = (*(in + 4) - base) >> (27 - 7); tmp |= (*(in + 5) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); + tmp = (*(in + 5) - base) >> (27 - 2); tmp |= (*(in + 6) - base) << 2; tmp |= (*(in + 7) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); + tmp = (*(in + 7) - base) >> (27 - 24); /* remaining: 8 bits */ length = (32 / 8) - (32 - 24) / 8; /* consumed: 3 bytes (total: 27) */ @@ -11158,11 +10948,11 @@ pack27_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 27; } -static uint32_t -unpack27_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static 
uint32_t unpack27_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 134217727); tmp = (*in32 >> 27); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11188,7 +10978,7 @@ unpack27_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 24) */ tmp |= (*in32 % (1U << 2)) << (27 - 2); *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); + *(out + 6) = base + ((*in32 >> 2) & 134217727); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 28) */ @@ -11198,40 +10988,39 @@ unpack27_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 27; } -static uint32_t -pack28_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack28_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); + tmp = (*(in + 1) - base) >> (28 - 24); tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (28 - 20); + tmp = (*(in + 2) - base) >> (28 - 20); tmp |= (*(in + 3) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); + tmp = (*(in + 3) - base) >> (28 - 16); tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); + tmp = (*(in + 4) - base) >> (28 - 12); tmp |= (*(in + 5) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); + tmp = (*(in + 5) - 
base) >> (28 - 8); tmp |= (*(in + 6) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); + tmp = (*(in + 6) - base) >> (28 - 4); tmp |= (*(in + 7) - base) << 4; /* remaining: 0 bits */ length = (32 / 8) - (32 - 32) / 8; @@ -11240,11 +11029,11 @@ pack28_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 28; } -static uint32_t -unpack28_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack28_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 268435455); tmp = (*in32 >> 28); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11275,50 +11064,49 @@ unpack28_8(uint32_t base, const uint8_t *in, uint32_t *out) { /* consumed: 4 bytes (total: 28) */ tmp |= (*in32 % (1U << 4)) << (28 - 4); *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); + *(out + 7) = base + ((*in32 >> 4) & 268435455); /* remaining: 0 bits */ return 28; } -static uint32_t -pack29_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack29_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); + tmp = (*(in + 1) - base) >> (29 - 26); tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); + tmp = (*(in + 2) - base) >> (29 - 23); tmp |= (*(in + 3) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); + tmp = (*(in + 3) - base) >> (29 - 20); tmp |= (*(in + 
4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); + tmp = (*(in + 4) - base) >> (29 - 17); tmp |= (*(in + 5) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); + tmp = (*(in + 5) - base) >> (29 - 14); tmp |= (*(in + 6) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); + tmp = (*(in + 6) - base) >> (29 - 11); tmp |= (*(in + 7) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); + tmp = (*(in + 7) - base) >> (29 - 8); /* remaining: 24 bits */ length = (32 / 8) - (32 - 8) / 8; /* consumed: 1 bytes (total: 29) */ @@ -11326,11 +11114,11 @@ pack29_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 29; } -static uint32_t -unpack29_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack29_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 536870911); tmp = (*in32 >> 29); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11370,45 +11158,44 @@ unpack29_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 29; } -static uint32_t -pack30_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack30_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (30 - 28); + tmp = (*(in + 1) - base) >> (30 - 28); tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (30 - 26); + tmp = (*(in + 2) - base) >> (30 - 26); tmp |= (*(in + 3) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); + tmp = (*(in + 3) - base) >> (30 - 24); tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); + tmp = (*(in + 4) - base) >> (30 - 22); tmp |= (*(in + 5) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); + tmp = (*(in + 5) - base) >> (30 - 20); tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); + tmp = (*(in + 6) - base) >> (30 - 18); tmp |= (*(in + 7) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (30 - 16); + tmp = (*(in + 7) - base) >> (30 - 16); /* remaining: 16 bits */ length = (32 / 8) - (32 - 16) / 8; /* consumed: 2 bytes (total: 30) */ @@ -11416,11 +11203,11 @@ pack30_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 30; } -static uint32_t -unpack30_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack30_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); + uint32_t tmp; + (void)tmp; + *(out + 0) = base + ((*in32 >> 0) & 1073741823); tmp = (*in32 >> 30); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11460,45 +11247,44 @@ unpack30_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 30; } -static uint32_t -pack31_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack31_8(uint32_t base, const uint32_t *in, uint8_t *out) { 
uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; tmp |= (*(in + 1) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); + tmp = (*(in + 1) - base) >> (31 - 30); tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); + tmp = (*(in + 2) - base) >> (31 - 29); tmp |= (*(in + 3) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); + tmp = (*(in + 3) - base) >> (31 - 28); tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); + tmp = (*(in + 4) - base) >> (31 - 27); tmp |= (*(in + 5) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); + tmp = (*(in + 5) - base) >> (31 - 26); tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); + tmp = (*(in + 6) - base) >> (31 - 25); tmp |= (*(in + 7) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (31 - 24); + tmp = (*(in + 7) - base) >> (31 - 24); /* remaining: 8 bits */ length = (32 / 8) - (32 - 24) / 8; /* consumed: 3 bytes (total: 31) */ @@ -11506,11 +11292,11 @@ pack31_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 31; } -static uint32_t -unpack31_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack31_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); + uint32_t tmp; + (void)tmp; + *(out + 0) = base 
+ ((*in32 >> 0) & 2147483647); tmp = (*in32 >> 31); in32++; /* consumed: 4 bytes (total: 8) */ @@ -11550,8 +11336,7 @@ unpack31_8(uint32_t base, const uint8_t *in, uint32_t *out) { return 31; } -static uint32_t -pack32_8(uint32_t base, const uint32_t *in, uint8_t *out) { +static uint32_t pack32_8(uint32_t base, const uint32_t *in, uint8_t *out) { uint32_t i; uint32_t *out32 = (uint32_t *)out; for (i = 0; i < 8; i++) @@ -11559,8 +11344,7 @@ pack32_8(uint32_t base, const uint32_t *in, uint8_t *out) { return 8 * sizeof(uint32_t); } -static uint32_t -unpack32_8(uint32_t base, const uint8_t *in, uint32_t *out) { +static uint32_t unpack32_8(uint32_t base, const uint8_t *in, uint32_t *out) { uint32_t i; uint32_t *in32 = (uint32_t *)in; for (i = 0; i < 8; i++) @@ -11569,83 +11353,26 @@ unpack32_8(uint32_t base, const uint8_t *in, uint32_t *out) { } for_packfunc_t for_pack8[33] = { - pack0_n, - pack1_8, - pack2_8, - pack3_8, - pack4_8, - pack5_8, - pack6_8, - pack7_8, - pack8_8, - pack9_8, - pack10_8, - pack11_8, - pack12_8, - pack13_8, - pack14_8, - pack15_8, - pack16_8, - pack17_8, - pack18_8, - pack19_8, - pack20_8, - pack21_8, - pack22_8, - pack23_8, - pack24_8, - pack25_8, - pack26_8, - pack27_8, - pack28_8, - pack29_8, - pack30_8, - pack31_8, - pack32_8 -}; + pack0_n, pack1_8, pack2_8, pack3_8, pack4_8, pack5_8, pack6_8, + pack7_8, pack8_8, pack9_8, pack10_8, pack11_8, pack12_8, pack13_8, + pack14_8, pack15_8, pack16_8, pack17_8, pack18_8, pack19_8, pack20_8, + pack21_8, pack22_8, pack23_8, pack24_8, pack25_8, pack26_8, pack27_8, + pack28_8, pack29_8, pack30_8, pack31_8, pack32_8}; for_unpackfunc_t for_unpack8[33] = { - unpack0_8, - unpack1_8, - unpack2_8, - unpack3_8, - unpack4_8, - unpack5_8, - unpack6_8, - unpack7_8, - unpack8_8, - unpack9_8, - unpack10_8, - unpack11_8, - unpack12_8, - unpack13_8, - unpack14_8, - unpack15_8, - unpack16_8, - unpack17_8, - unpack18_8, - unpack19_8, - unpack20_8, - unpack21_8, - unpack22_8, - unpack23_8, - unpack24_8, - unpack25_8, - 
unpack26_8, - unpack27_8, - unpack28_8, - unpack29_8, - unpack30_8, - unpack31_8, - unpack32_8 -}; - -static uint32_t -pack1_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { + unpack0_8, unpack1_8, unpack2_8, unpack3_8, unpack4_8, unpack5_8, + unpack6_8, unpack7_8, unpack8_8, unpack9_8, unpack10_8, unpack11_8, + unpack12_8, unpack13_8, unpack14_8, unpack15_8, unpack16_8, unpack17_8, + unpack18_8, unpack19_8, unpack20_8, unpack21_8, unpack22_8, unpack23_8, + unpack24_8, unpack25_8, unpack26_8, unpack27_8, unpack28_8, unpack29_8, + unpack30_8, unpack31_8, unpack32_8}; + +static uint32_t pack1_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 1; @@ -11671,51 +11398,52 @@ pack1_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 1) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 1) + 7) / 8; } -static uint32_t -unpack1_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack1_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1); + *(out + 0) = base + ((tmp >> 0) & 1); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 1) & 1); + *(out + 1) = base + ((tmp >> 1) & 1); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 2) & 1); + *(out + 2) = base + ((tmp >> 2) & 1); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 3) & 1); + *(out + 3) = base + ((tmp >> 3) & 1); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 4) & 1); + *(out + 4) = base + ((tmp >> 4) & 1); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 5) 
& 1); + *(out + 5) = base + ((tmp >> 5) & 1); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 6) & 1); + *(out + 6) = base + ((tmp >> 6) & 1); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 7) & 1); + *(out + 7) = base + ((tmp >> 7) & 1); if (length == 8) goto bail; bail: return ((length * 1) + 7) / 8; } -static uint32_t -pack2_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack2_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 2; @@ -11741,51 +11469,52 @@ pack2_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 2) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 2) + 7) / 8; } -static uint32_t -unpack2_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack2_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 3); + *(out + 0) = base + ((tmp >> 0) & 3); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 2) & 3); + *(out + 1) = base + ((tmp >> 2) & 3); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 4) & 3); + *(out + 2) = base + ((tmp >> 4) & 3); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 6) & 3); + *(out + 3) = base + ((tmp >> 6) & 3); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 8) & 3); + *(out + 4) = base + ((tmp >> 8) & 3); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 10) & 3); + *(out + 5) = base + ((tmp >> 10) & 3); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 12) & 3); + *(out + 6) = base + ((tmp >> 12) & 3); if (length == 
7) goto bail; - *(out + 7) = base + ((tmp >> 14) & 3); + *(out + 7) = base + ((tmp >> 14) & 3); if (length == 8) goto bail; bail: return ((length * 2) + 7) / 8; } -static uint32_t -pack3_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack3_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 3; @@ -11811,51 +11540,52 @@ pack3_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 3) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 3) + 7) / 8; } -static uint32_t -unpack3_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack3_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 7); + *(out + 0) = base + ((tmp >> 0) & 7); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 3) & 7); + *(out + 1) = base + ((tmp >> 3) & 7); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 6) & 7); + *(out + 2) = base + ((tmp >> 6) & 7); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 9) & 7); + *(out + 3) = base + ((tmp >> 9) & 7); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 12) & 7); + *(out + 4) = base + ((tmp >> 12) & 7); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 15) & 7); + *(out + 5) = base + ((tmp >> 15) & 7); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 18) & 7); + *(out + 6) = base + ((tmp >> 18) & 7); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 21) & 7); + *(out + 7) = base + ((tmp >> 21) & 7); if (length == 8) goto bail; bail: return ((length * 3) + 7) / 8; } 
-static uint32_t -pack4_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack4_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 4; @@ -11881,51 +11611,52 @@ pack4_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 4) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 4) + 7) / 8; } -static uint32_t -unpack4_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack4_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 15); + *(out + 0) = base + ((tmp >> 0) & 15); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 4) & 15); + *(out + 1) = base + ((tmp >> 4) & 15); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 8) & 15); + *(out + 2) = base + ((tmp >> 8) & 15); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 12) & 15); + *(out + 3) = base + ((tmp >> 12) & 15); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 16) & 15); + *(out + 4) = base + ((tmp >> 16) & 15); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 20) & 15); + *(out + 5) = base + ((tmp >> 20) & 15); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 24) & 15); + *(out + 6) = base + ((tmp >> 24) & 15); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 28) & 15); + *(out + 7) = base + ((tmp >> 28) & 15); if (length == 8) goto bail; bail: return ((length * 4) + 7) / 8; } -static uint32_t -pack5_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack5_x(uint32_t base, const 
uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 5; @@ -11946,7 +11677,7 @@ pack5_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (5 - 3); + tmp = (*(in + 6) - base) >> (5 - 3); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 3; @@ -11954,55 +11685,56 @@ pack5_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 5) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 5) + 7) / 8; } -static uint32_t -unpack5_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack5_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 31); + *(out + 0) = base + ((tmp >> 0) & 31); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 5) & 31); + *(out + 1) = base + ((tmp >> 5) & 31); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 10) & 31); + *(out + 2) = base + ((tmp >> 10) & 31); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 15) & 31); + *(out + 3) = base + ((tmp >> 15) & 31); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 20) & 31); + *(out + 4) = base + ((tmp >> 20) & 31); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 25) & 31); + *(out + 5) = base + ((tmp >> 25) & 31); if (length == 6) goto bail; - *(out + 6) = tmp >> 30; + *(out + 6) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 3)) << (5 - 3); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 3) & 
31); + *(out + 7) = base + ((tmp >> 3) & 31); if (length == 8) goto bail; bail: return ((length * 5) + 7) / 8; } -static uint32_t -pack6_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack6_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 6; @@ -12020,7 +11752,7 @@ pack6_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (6 - 4); + tmp = (*(in + 5) - base) >> (6 - 4); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 4; @@ -12031,55 +11763,56 @@ pack6_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 6) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 6) + 7) / 8; } -static uint32_t -unpack6_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack6_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 63); + *(out + 0) = base + ((tmp >> 0) & 63); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 6) & 63); + *(out + 1) = base + ((tmp >> 6) & 63); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 12) & 63); + *(out + 2) = base + ((tmp >> 12) & 63); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 18) & 63); + *(out + 3) = base + ((tmp >> 18) & 63); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 24) & 63); + *(out + 4) = base + ((tmp >> 24) & 63); if (length == 5) goto bail; - *(out + 5) = tmp >> 30; + *(out + 5) = tmp >> 30; in += sizeof(uint32_t); tmp = 
*(uint32_t *)in; *(out + 5) |= (tmp % (1U << 4)) << (6 - 4); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 4) & 63); + *(out + 6) = base + ((tmp >> 4) & 63); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 10) & 63); + *(out + 7) = base + ((tmp >> 10) & 63); if (length == 8) goto bail; bail: return ((length * 6) + 7) / 8; } -static uint32_t -pack7_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack7_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 7; @@ -12094,7 +11827,7 @@ pack7_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (7 - 3); + tmp = (*(in + 4) - base) >> (7 - 3); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 3; @@ -12108,55 +11841,56 @@ pack7_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 7) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 7) + 7) / 8; } -static uint32_t -unpack7_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack7_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 127); + *(out + 0) = base + ((tmp >> 0) & 127); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 7) & 127); + *(out + 1) = base + ((tmp >> 7) & 127); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 14) & 127); + *(out + 2) = base + ((tmp >> 14) & 127); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 21) & 127); + 
*(out + 3) = base + ((tmp >> 21) & 127); if (length == 4) goto bail; - *(out + 4) = tmp >> 28; + *(out + 4) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 3)) << (7 - 3); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 3) & 127); + *(out + 5) = base + ((tmp >> 3) & 127); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 10) & 127); + *(out + 6) = base + ((tmp >> 10) & 127); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 17) & 127); + *(out + 7) = base + ((tmp >> 17) & 127); if (length == 8) goto bail; bail: return ((length * 7) + 7) / 8; } -static uint32_t -pack8_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack8_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 8; @@ -12170,7 +11904,7 @@ pack8_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 8; @@ -12184,53 +11918,54 @@ pack8_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 8) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 8) + 7) / 8; } -static uint32_t -unpack8_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack8_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 255); + *(out + 0) = base + ((tmp >> 0) & 255); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 8) & 
255); + *(out + 1) = base + ((tmp >> 8) & 255); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 16) & 255); + *(out + 2) = base + ((tmp >> 16) & 255); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 24) & 255); + *(out + 3) = base + ((tmp >> 24) & 255); if (length == 4) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 255); + *(out + 4) = base + ((tmp >> 0) & 255); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 8) & 255); + *(out + 5) = base + ((tmp >> 8) & 255); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 16) & 255); + *(out + 6) = base + ((tmp >> 16) & 255); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 24) & 255); + *(out + 7) = base + ((tmp >> 24) & 255); if (length == 8) goto bail; bail: return ((length * 8) + 7) / 8; } -static uint32_t -pack9_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack9_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 9; @@ -12242,7 +11977,7 @@ pack9_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (9 - 4); + tmp = (*(in + 3) - base) >> (9 - 4); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 4; @@ -12257,48 +11992,49 @@ pack9_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (9 - 8); + tmp = (*(in + 7) - base) >> (9 - 8); if (length == 8) goto bail; bail: remaining = (((length * 9) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 9) + 7) 
/ 8; } -static uint32_t -unpack9_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack9_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 511); + *(out + 0) = base + ((tmp >> 0) & 511); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 9) & 511); + *(out + 1) = base + ((tmp >> 9) & 511); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 18) & 511); + *(out + 2) = base + ((tmp >> 18) & 511); if (length == 3) goto bail; - *(out + 3) = tmp >> 27; + *(out + 3) = tmp >> 27; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 4)) << (9 - 4); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 4) & 511); + *(out + 4) = base + ((tmp >> 4) & 511); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 13) & 511); + *(out + 5) = base + ((tmp >> 13) & 511); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 22) & 511); + *(out + 6) = base + ((tmp >> 22) & 511); if (length == 7) goto bail; - *(out + 7) = tmp >> 31; + *(out + 7) = tmp >> 31; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << (9 - 8); @@ -12309,12 +12045,12 @@ unpack9_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 9) + 7) / 8; } -static uint32_t -pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 10; @@ -12326,7 +12062,7 @@ pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (10 
- 8); + tmp = (*(in + 3) - base) >> (10 - 8); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 8; @@ -12338,7 +12074,7 @@ pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (10 - 6); + tmp = (*(in + 6) - base) >> (10 - 6); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 6; @@ -12346,59 +12082,60 @@ pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 10) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 10) + 7) / 8; } -static uint32_t -unpack10_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack10_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1023); + *(out + 0) = base + ((tmp >> 0) & 1023); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 10) & 1023); + *(out + 1) = base + ((tmp >> 10) & 1023); if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 20) & 1023); + *(out + 2) = base + ((tmp >> 20) & 1023); if (length == 3) goto bail; - *(out + 3) = tmp >> 30; + *(out + 3) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 8)) << (10 - 8); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 8) & 1023); + *(out + 4) = base + ((tmp >> 8) & 1023); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 18) & 1023); + *(out + 5) = base + ((tmp >> 18) & 1023); if (length == 6) goto bail; - *(out + 6) = tmp >> 28; + *(out + 6) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 6)) << (10 - 6); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 6) & 1023); + 
*(out + 7) = base + ((tmp >> 6) & 1023); if (length == 8) goto bail; bail: return ((length * 10) + 7) / 8; } -static uint32_t -pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 11; @@ -12407,7 +12144,7 @@ pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (11 - 1); + tmp = (*(in + 2) - base) >> (11 - 1); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 1; @@ -12419,7 +12156,7 @@ pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (11 - 2); + tmp = (*(in + 5) - base) >> (11 - 2); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 2; @@ -12430,59 +12167,60 @@ pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 11) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 11) + 7) / 8; } -static uint32_t -unpack11_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack11_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2047); + *(out + 0) = base + ((tmp >> 0) & 2047); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 11) & 2047); + *(out + 1) = base + ((tmp >> 11) & 2047); if (length == 2) goto bail; - *(out + 2) = tmp >> 22; + *(out + 2) = tmp >> 22; in += sizeof(uint32_t); tmp = 
*(uint32_t *)in; *(out + 2) |= (tmp % (1U << 1)) << (11 - 1); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 1) & 2047); + *(out + 3) = base + ((tmp >> 1) & 2047); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 12) & 2047); + *(out + 4) = base + ((tmp >> 12) & 2047); if (length == 5) goto bail; - *(out + 5) = tmp >> 23; + *(out + 5) = tmp >> 23; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 2)) << (11 - 2); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 2) & 2047); + *(out + 6) = base + ((tmp >> 2) & 2047); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 13) & 2047); + *(out + 7) = base + ((tmp >> 13) & 2047); if (length == 8) goto bail; bail: return ((length * 11) + 7) / 8; } -static uint32_t -pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 12; @@ -12491,7 +12229,7 @@ pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (12 - 4); + tmp = (*(in + 2) - base) >> (12 - 4); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 4; @@ -12503,7 +12241,7 @@ pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (12 - 8); + tmp = (*(in + 5) - base) >> (12 - 8); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 8; @@ -12514,59 +12252,60 @@ pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 12) + 7) / 8) % 4; - if (remaining == 
0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 12) + 7) / 8; } -static uint32_t -unpack12_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack12_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4095); + *(out + 0) = base + ((tmp >> 0) & 4095); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 12) & 4095); + *(out + 1) = base + ((tmp >> 12) & 4095); if (length == 2) goto bail; - *(out + 2) = tmp >> 24; + *(out + 2) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 4)) << (12 - 4); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 4) & 4095); + *(out + 3) = base + ((tmp >> 4) & 4095); if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 16) & 4095); + *(out + 4) = base + ((tmp >> 16) & 4095); if (length == 5) goto bail; - *(out + 5) = tmp >> 28; + *(out + 5) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 8)) << (12 - 8); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 8) & 4095); + *(out + 6) = base + ((tmp >> 8) & 4095); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 20) & 4095); + *(out + 7) = base + ((tmp >> 20) & 4095); if (length == 8) goto bail; bail: return ((length * 12) + 7) / 8; } -static uint32_t -pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 13; @@ -12575,7 +12314,7 @@ pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 2) - base) << 26; 
*(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (13 - 7); + tmp = (*(in + 2) - base) >> (13 - 7); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 7; @@ -12584,7 +12323,7 @@ pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (13 - 1); + tmp = (*(in + 4) - base) >> (13 - 1); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 1; @@ -12596,52 +12335,53 @@ pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (13 - 8); + tmp = (*(in + 7) - base) >> (13 - 8); if (length == 8) goto bail; bail: remaining = (((length * 13) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 13) + 7) / 8; } -static uint32_t -unpack13_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack13_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 8191); + *(out + 0) = base + ((tmp >> 0) & 8191); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 13) & 8191); + *(out + 1) = base + ((tmp >> 13) & 8191); if (length == 2) goto bail; - *(out + 2) = tmp >> 26; + *(out + 2) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 7)) << (13 - 7); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 7) & 8191); + *(out + 3) = base + ((tmp >> 7) & 8191); if (length == 4) goto bail; - *(out + 4) = tmp >> 20; + *(out + 4) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 1)) << (13 - 1); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = 
base + ((tmp >> 1) & 8191); + *(out + 5) = base + ((tmp >> 1) & 8191); if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 14) & 8191); + *(out + 6) = base + ((tmp >> 14) & 8191); if (length == 7) goto bail; - *(out + 7) = tmp >> 27; + *(out + 7) = tmp >> 27; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << (13 - 8); @@ -12652,12 +12392,12 @@ unpack13_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 13) + 7) / 8; } -static uint32_t -pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 14; @@ -12666,7 +12406,7 @@ pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (14 - 10); + tmp = (*(in + 2) - base) >> (14 - 10); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 10; @@ -12675,7 +12415,7 @@ pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (14 - 6); + tmp = (*(in + 4) - base) >> (14 - 6); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 6; @@ -12684,7 +12424,7 @@ pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (14 - 2); + tmp = (*(in + 6) - base) >> (14 - 2); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 2; @@ -12692,63 +12432,64 @@ pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 14) + 7) / 8) % 
4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 14) + 7) / 8; } -static uint32_t -unpack14_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack14_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 16383); + *(out + 0) = base + ((tmp >> 0) & 16383); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 14) & 16383); + *(out + 1) = base + ((tmp >> 14) & 16383); if (length == 2) goto bail; - *(out + 2) = tmp >> 28; + *(out + 2) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 10)) << (14 - 10); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 10) & 16383); + *(out + 3) = base + ((tmp >> 10) & 16383); if (length == 4) goto bail; - *(out + 4) = tmp >> 24; + *(out + 4) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 6)) << (14 - 6); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 6) & 16383); + *(out + 5) = base + ((tmp >> 6) & 16383); if (length == 6) goto bail; - *(out + 6) = tmp >> 20; + *(out + 6) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 2)) << (14 - 2); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 2) & 16383); + *(out + 7) = base + ((tmp >> 2) & 16383); if (length == 8) goto bail; bail: return ((length * 14) + 7) / 8; } -static uint32_t -pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 15; @@ -12757,7 +12498,7 @@ 
pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (15 - 13); + tmp = (*(in + 2) - base) >> (15 - 13); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 13; @@ -12766,7 +12507,7 @@ pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (15 - 11); + tmp = (*(in + 4) - base) >> (15 - 11); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 11; @@ -12775,7 +12516,7 @@ pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (15 - 9); + tmp = (*(in + 6) - base) >> (15 - 9); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 9; @@ -12783,63 +12524,64 @@ pack15_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 15) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 15) + 7) / 8; } -static uint32_t -unpack15_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack15_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 32767); + *(out + 0) = base + ((tmp >> 0) & 32767); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 15) & 32767); + *(out + 1) = base + ((tmp >> 15) & 32767); if (length == 2) goto bail; - *(out + 2) = tmp >> 30; + *(out + 2) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 13)) << (15 - 13); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 13) & 32767); + *(out + 3) 
= base + ((tmp >> 13) & 32767); if (length == 4) goto bail; - *(out + 4) = tmp >> 28; + *(out + 4) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 11)) << (15 - 11); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 11) & 32767); + *(out + 5) = base + ((tmp >> 11) & 32767); if (length == 6) goto bail; - *(out + 6) = tmp >> 26; + *(out + 6) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 9)) << (15 - 9); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 9) & 32767); + *(out + 7) = base + ((tmp >> 9) & 32767); if (length == 8) goto bail; bail: return ((length * 15) + 7) / 8; } -static uint32_t -pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 16; @@ -12847,7 +12589,7 @@ pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) << 0; + tmp = (*(in + 2) - base) << 0; if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 16; @@ -12855,7 +12597,7 @@ pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 16; @@ -12863,7 +12605,7 @@ pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) << 0; + tmp = (*(in + 6) - base) << 0; if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 16; @@ -12871,63 +12613,64 @@ pack16_x(uint32_t 
base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 16) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 16) + 7) / 8; } -static uint32_t -unpack16_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack16_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 65535); + *(out + 0) = base + ((tmp >> 0) & 65535); if (length == 1) goto bail; - *(out + 1) = base + ((tmp >> 16) & 65535); + *(out + 1) = base + ((tmp >> 16) & 65535); if (length == 2) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 2) = base + ((tmp >> 0) & 65535); + *(out + 2) = base + ((tmp >> 0) & 65535); if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 16) & 65535); + *(out + 3) = base + ((tmp >> 16) & 65535); if (length == 4) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 65535); + *(out + 4) = base + ((tmp >> 0) & 65535); if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 16) & 65535); + *(out + 5) = base + ((tmp >> 16) & 65535); if (length == 6) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 6) = base + ((tmp >> 0) & 65535); + *(out + 6) = base + ((tmp >> 0) & 65535); if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 16) & 65535); + *(out + 7) = base + ((tmp >> 16) & 65535); if (length == 8) goto bail; bail: return ((length * 16) + 7) / 8; } -static uint32_t -pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) 
<< 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (17 - 2); + tmp = (*(in + 1) - base) >> (17 - 2); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 2; @@ -12936,7 +12679,7 @@ pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (17 - 4); + tmp = (*(in + 3) - base) >> (17 - 4); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 4; @@ -12945,7 +12688,7 @@ pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (17 - 6); + tmp = (*(in + 5) - base) >> (17 - 6); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 6; @@ -12954,56 +12697,57 @@ pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (17 - 8); + tmp = (*(in + 7) - base) >> (17 - 8); if (length == 8) goto bail; bail: remaining = (((length * 17) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 17) + 7) / 8; } -static uint32_t -unpack17_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack17_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 131071); + *(out + 0) = base + ((tmp >> 0) & 131071); if (length == 1) goto bail; - *(out + 1) = tmp >> 17; + *(out + 1) = tmp >> 17; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 2)) << (17 - 2); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 2) & 131071); + *(out + 2) = base + ((tmp >> 2) & 131071); if (length == 3) 
goto bail; - *(out + 3) = tmp >> 19; + *(out + 3) = tmp >> 19; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 4)) << (17 - 4); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 4) & 131071); + *(out + 4) = base + ((tmp >> 4) & 131071); if (length == 5) goto bail; - *(out + 5) = tmp >> 21; + *(out + 5) = tmp >> 21; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 6)) << (17 - 6); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 6) & 131071); + *(out + 6) = base + ((tmp >> 6) & 131071); if (length == 7) goto bail; - *(out + 7) = tmp >> 23; + *(out + 7) = tmp >> 23; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << (17 - 8); @@ -13014,18 +12758,18 @@ unpack17_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 17) + 7) / 8; } -static uint32_t -pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (18 - 4); + tmp = (*(in + 1) - base) >> (18 - 4); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 4; @@ -13034,7 +12778,7 @@ pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (18 - 8); + tmp = (*(in + 3) - base) >> (18 - 8); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 8; @@ -13043,7 +12787,7 @@ pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - 
base) >> (18 - 12); + tmp = (*(in + 5) - base) >> (18 - 12); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 12; @@ -13052,56 +12796,57 @@ pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (18 - 16); + tmp = (*(in + 7) - base) >> (18 - 16); if (length == 8) goto bail; bail: remaining = (((length * 18) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 18) + 7) / 8; } -static uint32_t -unpack18_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack18_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 262143); + *(out + 0) = base + ((tmp >> 0) & 262143); if (length == 1) goto bail; - *(out + 1) = tmp >> 18; + *(out + 1) = tmp >> 18; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 4)) << (18 - 4); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 4) & 262143); + *(out + 2) = base + ((tmp >> 4) & 262143); if (length == 3) goto bail; - *(out + 3) = tmp >> 22; + *(out + 3) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 8)) << (18 - 8); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 8) & 262143); + *(out + 4) = base + ((tmp >> 8) & 262143); if (length == 5) goto bail; - *(out + 5) = tmp >> 26; + *(out + 5) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 12)) << (18 - 12); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 12) & 262143); + *(out + 6) = base + ((tmp >> 12) & 262143); if (length == 7) goto bail; - *(out + 7) = tmp >> 30; + *(out + 7) = tmp >> 30; in += sizeof(uint32_t); tmp = 
*(uint32_t *)in; *(out + 7) |= (tmp % (1U << 16)) << (18 - 16); @@ -13112,18 +12857,18 @@ unpack18_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 18) + 7) / 8; } -static uint32_t -pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (19 - 6); + tmp = (*(in + 1) - base) >> (19 - 6); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 6; @@ -13132,7 +12877,7 @@ pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (19 - 12); + tmp = (*(in + 3) - base) >> (19 - 12); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 12; @@ -13141,13 +12886,13 @@ pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (19 - 18); + tmp = (*(in + 5) - base) >> (19 - 18); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (19 - 5); + tmp = (*(in + 6) - base) >> (19 - 5); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 5; @@ -13155,73 +12900,74 @@ pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 19) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 19) + 7) / 8; } -static uint32_t -unpack19_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t 
length) { +static uint32_t unpack19_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 524287); + *(out + 0) = base + ((tmp >> 0) & 524287); if (length == 1) goto bail; - *(out + 1) = tmp >> 19; + *(out + 1) = tmp >> 19; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 6)) << (19 - 6); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 6) & 524287); + *(out + 2) = base + ((tmp >> 6) & 524287); if (length == 3) goto bail; - *(out + 3) = tmp >> 25; + *(out + 3) = tmp >> 25; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 12)) << (19 - 12); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 12) & 524287); + *(out + 4) = base + ((tmp >> 12) & 524287); if (length == 5) goto bail; - *(out + 5) = tmp >> 31; + *(out + 5) = tmp >> 31; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 18)) << (19 - 18); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 18; + *(out + 6) = tmp >> 18; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 5)) << (19 - 5); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 5) & 524287); + *(out + 7) = base + ((tmp >> 5) & 524287); if (length == 8) goto bail; bail: return ((length * 19) + 7) / 8; } -static uint32_t -pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (20 - 8); + tmp = (*(in + 1) - base) >> (20 - 8); if (length == 2) goto bail; tmp |= (*(in + 
2) - base) << 8; @@ -13230,13 +12976,13 @@ pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (20 - 16); + tmp = (*(in + 3) - base) >> (20 - 16); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (20 - 4); + tmp = (*(in + 4) - base) >> (20 - 4); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 4; @@ -13245,7 +12991,7 @@ pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (20 - 12); + tmp = (*(in + 6) - base) >> (20 - 12); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 12; @@ -13253,73 +12999,74 @@ pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 20) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 20) + 7) / 8; } -static uint32_t -unpack20_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack20_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1048575); + *(out + 0) = base + ((tmp >> 0) & 1048575); if (length == 1) goto bail; - *(out + 1) = tmp >> 20; + *(out + 1) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 8)) << (20 - 8); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 8) & 1048575); + *(out + 2) = base + ((tmp >> 8) & 1048575); if (length == 3) goto bail; - *(out + 3) = tmp >> 28; + *(out + 3) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 16)) << (20 - 16); 
*(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 16; + *(out + 4) = tmp >> 16; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 4)) << (20 - 4); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 4) & 1048575); + *(out + 5) = base + ((tmp >> 4) & 1048575); if (length == 6) goto bail; - *(out + 6) = tmp >> 24; + *(out + 6) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 12)) << (20 - 12); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 12) & 1048575); + *(out + 7) = base + ((tmp >> 12) & 1048575); if (length == 8) goto bail; bail: return ((length * 20) + 7) / 8; } -static uint32_t -pack21_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack21_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 21; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (21 - 10); + tmp = (*(in + 1) - base) >> (21 - 10); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 10; @@ -13328,13 +13075,13 @@ pack21_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 3) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (21 - 20); + tmp = (*(in + 3) - base) >> (21 - 20); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (21 - 9); + tmp = (*(in + 4) - base) >> (21 - 9); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 9; @@ -13343,66 +13090,67 @@ pack21_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) 
- base) >> (21 - 19); + tmp = (*(in + 6) - base) >> (21 - 19); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (21 - 8); + tmp = (*(in + 7) - base) >> (21 - 8); if (length == 8) goto bail; bail: remaining = (((length * 21) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 21) + 7) / 8; } -static uint32_t -unpack21_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack21_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2097151); + *(out + 0) = base + ((tmp >> 0) & 2097151); if (length == 1) goto bail; - *(out + 1) = tmp >> 21; + *(out + 1) = tmp >> 21; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 10)) << (21 - 10); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = base + ((tmp >> 10) & 2097151); + *(out + 2) = base + ((tmp >> 10) & 2097151); if (length == 3) goto bail; - *(out + 3) = tmp >> 31; + *(out + 3) = tmp >> 31; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 20)) << (21 - 20); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 20; + *(out + 4) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 9)) << (21 - 9); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 9) & 2097151); + *(out + 5) = base + ((tmp >> 9) & 2097151); if (length == 6) goto bail; - *(out + 6) = tmp >> 30; + *(out + 6) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 19)) << (21 - 19); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 19; + *(out + 7) = tmp >> 19; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << 
(21 - 8); @@ -13413,24 +13161,24 @@ unpack21_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 21) + 7) / 8; } -static uint32_t -pack22_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack22_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (22 - 12); + tmp = (*(in + 1) - base) >> (22 - 12); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (22 - 2); + tmp = (*(in + 2) - base) >> (22 - 2); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 2; @@ -13439,13 +13187,13 @@ pack22_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (22 - 14); + tmp = (*(in + 4) - base) >> (22 - 14); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (22 - 4); + tmp = (*(in + 5) - base) >> (22 - 4); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 4; @@ -13454,60 +13202,61 @@ pack22_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (22 - 16); + tmp = (*(in + 7) - base) >> (22 - 16); if (length == 8) goto bail; bail: remaining = (((length * 22) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 22) + 7) / 8; } -static uint32_t -unpack22_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { 
+static uint32_t unpack22_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4194303); + *(out + 0) = base + ((tmp >> 0) & 4194303); if (length == 1) goto bail; - *(out + 1) = tmp >> 22; + *(out + 1) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 12)) << (22 - 12); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 12; + *(out + 2) = tmp >> 12; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 2)) << (22 - 2); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 2) & 4194303); + *(out + 3) = base + ((tmp >> 2) & 4194303); if (length == 4) goto bail; - *(out + 4) = tmp >> 24; + *(out + 4) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 14)) << (22 - 14); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 14; + *(out + 5) = tmp >> 14; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 4)) << (22 - 4); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 4) & 4194303); + *(out + 6) = base + ((tmp >> 4) & 4194303); if (length == 7) goto bail; - *(out + 7) = tmp >> 26; + *(out + 7) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 16)) << (22 - 16); @@ -13518,24 +13267,24 @@ unpack22_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 22) + 7) / 8; } -static uint32_t -pack23_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack23_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - 
tmp = (*(in + 1) - base) >> (23 - 14); + tmp = (*(in + 1) - base) >> (23 - 14); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (23 - 5); + tmp = (*(in + 2) - base) >> (23 - 5); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 5; @@ -13544,19 +13293,19 @@ pack23_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (23 - 19); + tmp = (*(in + 4) - base) >> (23 - 19); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 19; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (23 - 10); + tmp = (*(in + 5) - base) >> (23 - 10); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 10; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (23 - 1); + tmp = (*(in + 6) - base) >> (23 - 1); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 1; @@ -13564,83 +13313,84 @@ pack23_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 23) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 23) + 7) / 8; } -static uint32_t -unpack23_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack23_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 8388607); + *(out + 0) = base + ((tmp >> 0) & 8388607); if (length == 1) goto bail; - *(out + 1) = tmp >> 23; + *(out + 1) = tmp >> 23; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 14)) << (23 - 14); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 14; + *(out + 2) = tmp >> 14; in += sizeof(uint32_t); tmp = 
*(uint32_t *)in; *(out + 2) |= (tmp % (1U << 5)) << (23 - 5); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 5) & 8388607); + *(out + 3) = base + ((tmp >> 5) & 8388607); if (length == 4) goto bail; - *(out + 4) = tmp >> 28; + *(out + 4) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 19)) << (23 - 19); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 19; + *(out + 5) = tmp >> 19; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 10)) << (23 - 10); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 10; + *(out + 6) = tmp >> 10; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 1)) << (23 - 1); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 1) & 8388607); + *(out + 7) = base + ((tmp >> 1) & 8388607); if (length == 8) goto bail; bail: return ((length * 23) + 7) / 8; } -static uint32_t -pack24_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack24_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (24 - 16); + tmp = (*(in + 1) - base) >> (24 - 16); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (24 - 8); + tmp = (*(in + 2) - base) >> (24 - 8); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 8; @@ -13648,19 +13398,19 @@ pack24_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; if (length == 5) goto bail; tmp |= (*(in + 5) - 
base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (24 - 16); + tmp = (*(in + 5) - base) >> (24 - 16); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 16; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (24 - 8); + tmp = (*(in + 6) - base) >> (24 - 8); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 8; @@ -13668,87 +13418,88 @@ pack24_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 24) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 24) + 7) / 8; } -static uint32_t -unpack24_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack24_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 16777215); + *(out + 0) = base + ((tmp >> 0) & 16777215); if (length == 1) goto bail; - *(out + 1) = tmp >> 24; + *(out + 1) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 16)) << (24 - 16); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 16; + *(out + 2) = tmp >> 16; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 8)) << (24 - 8); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = base + ((tmp >> 8) & 16777215); + *(out + 3) = base + ((tmp >> 8) & 16777215); if (length == 4) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 16777215); + *(out + 4) = base + ((tmp >> 0) & 16777215); if (length == 5) goto bail; - *(out + 5) = tmp >> 24; + *(out + 5) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 16)) << (24 - 16); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 16; + *(out + 6) = tmp >> 
16; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 8)) << (24 - 8); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 8) & 16777215); + *(out + 7) = base + ((tmp >> 8) & 16777215); if (length == 8) goto bail; bail: return ((length * 24) + 7) / 8; } -static uint32_t -pack25_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack25_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (25 - 18); + tmp = (*(in + 1) - base) >> (25 - 18); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (25 - 11); + tmp = (*(in + 2) - base) >> (25 - 11); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (25 - 4); + tmp = (*(in + 3) - base) >> (25 - 4); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 4; @@ -13757,76 +13508,77 @@ pack25_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 5) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (25 - 22); + tmp = (*(in + 5) - base) >> (25 - 22); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (25 - 15); + tmp = (*(in + 6) - base) >> (25 - 15); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 15; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (25 - 8); + tmp = (*(in + 7) - base) >> (25 - 8); if (length == 8) goto bail; bail: remaining = (((length * 25) + 7) / 8) % 4; - if (remaining == 0) 
remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 25) + 7) / 8; } -static uint32_t -unpack25_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack25_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 33554431); + *(out + 0) = base + ((tmp >> 0) & 33554431); if (length == 1) goto bail; - *(out + 1) = tmp >> 25; + *(out + 1) = tmp >> 25; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 18)) << (25 - 18); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 18; + *(out + 2) = tmp >> 18; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 11)) << (25 - 11); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 11; + *(out + 3) = tmp >> 11; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 4)) << (25 - 4); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = base + ((tmp >> 4) & 33554431); + *(out + 4) = base + ((tmp >> 4) & 33554431); if (length == 5) goto bail; - *(out + 5) = tmp >> 29; + *(out + 5) = tmp >> 29; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 22)) << (25 - 22); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 22; + *(out + 6) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 15)) << (25 - 15); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 15; + *(out + 7) = tmp >> 15; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << (25 - 8); @@ -13837,36 +13589,36 @@ unpack25_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 25) + 7) / 8; } -static uint32_t -pack26_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t 
pack26_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (26 - 20); + tmp = (*(in + 1) - base) >> (26 - 20); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (26 - 14); + tmp = (*(in + 2) - base) >> (26 - 14); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (26 - 8); + tmp = (*(in + 3) - base) >> (26 - 8); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (26 - 2); + tmp = (*(in + 4) - base) >> (26 - 2); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 2; @@ -13875,70 +13627,71 @@ pack26_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 6) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (26 - 22); + tmp = (*(in + 6) - base) >> (26 - 22); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (26 - 16); + tmp = (*(in + 7) - base) >> (26 - 16); if (length == 8) goto bail; bail: remaining = (((length * 26) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 26) + 7) / 8; } -static uint32_t -unpack26_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack26_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 
67108863); + *(out + 0) = base + ((tmp >> 0) & 67108863); if (length == 1) goto bail; - *(out + 1) = tmp >> 26; + *(out + 1) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 20)) << (26 - 20); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 20; + *(out + 2) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 14)) << (26 - 14); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 14; + *(out + 3) = tmp >> 14; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 8)) << (26 - 8); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 8; + *(out + 4) = tmp >> 8; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 2)) << (26 - 2); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = base + ((tmp >> 2) & 67108863); + *(out + 5) = base + ((tmp >> 2) & 67108863); if (length == 6) goto bail; - *(out + 6) = tmp >> 28; + *(out + 6) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 22)) << (26 - 22); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 22; + *(out + 7) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 16)) << (26 - 16); @@ -13949,42 +13702,42 @@ unpack26_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 26) + 7) / 8; } -static uint32_t -pack27_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack27_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (27 - 22); + tmp = (*(in + 1) - base) >> (27 - 22); if (length == 2) goto bail; tmp |= (*(in 
+ 2) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (27 - 17); + tmp = (*(in + 2) - base) >> (27 - 17); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (27 - 12); + tmp = (*(in + 3) - base) >> (27 - 12); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (27 - 7); + tmp = (*(in + 4) - base) >> (27 - 7); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 7; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (27 - 2); + tmp = (*(in + 5) - base) >> (27 - 2); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 2; @@ -13993,64 +13746,65 @@ pack27_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { tmp |= (*(in + 7) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (27 - 24); + tmp = (*(in + 7) - base) >> (27 - 24); if (length == 8) goto bail; bail: remaining = (((length * 27) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 27) + 7) / 8; } -static uint32_t -unpack27_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack27_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 134217727); + *(out + 0) = base + ((tmp >> 0) & 134217727); if (length == 1) goto bail; - *(out + 1) = tmp >> 27; + *(out + 1) = tmp >> 27; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 22)) << (27 - 22); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 22; + *(out + 2) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 17)) << (27 - 17); *(out + 2) 
+= base; if (length == 3) goto bail; - *(out + 3) = tmp >> 17; + *(out + 3) = tmp >> 17; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 12)) << (27 - 12); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 12; + *(out + 4) = tmp >> 12; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 7)) << (27 - 7); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 7; + *(out + 5) = tmp >> 7; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 2)) << (27 - 2); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = base + ((tmp >> 2) & 134217727); + *(out + 6) = base + ((tmp >> 2) & 134217727); if (length == 7) goto bail; - *(out + 7) = tmp >> 29; + *(out + 7) = tmp >> 29; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 24)) << (27 - 24); @@ -14061,48 +13815,48 @@ unpack27_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 27) + 7) / 8; } -static uint32_t -pack28_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack28_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (28 - 24); + tmp = (*(in + 1) - base) >> (28 - 24); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (28 - 20); + tmp = (*(in + 2) - base) >> (28 - 20); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (28 - 16); + tmp = (*(in + 3) - base) >> (28 - 16); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 16; *(uint32_t *)out = tmp; out += 
sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (28 - 12); + tmp = (*(in + 4) - base) >> (28 - 12); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 12; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (28 - 8); + tmp = (*(in + 5) - base) >> (28 - 8); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 8; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (28 - 4); + tmp = (*(in + 6) - base) >> (28 - 4); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 4; @@ -14110,178 +13864,180 @@ pack28_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { goto bail; bail: remaining = (((length * 28) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 28) + 7) / 8; } -static uint32_t -unpack28_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack28_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 268435455); + *(out + 0) = base + ((tmp >> 0) & 268435455); if (length == 1) goto bail; - *(out + 1) = tmp >> 28; + *(out + 1) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 24)) << (28 - 24); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 24; + *(out + 2) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 20)) << (28 - 20); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 20; + *(out + 3) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 16)) << (28 - 16); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 16; + *(out + 4) = tmp >> 16; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 12)) << (28 - 12); *(out + 4) += base; if (length == 5) goto 
bail; - *(out + 5) = tmp >> 12; + *(out + 5) = tmp >> 12; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 8)) << (28 - 8); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 8; + *(out + 6) = tmp >> 8; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 4)) << (28 - 4); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = base + ((tmp >> 4) & 268435455); + *(out + 7) = base + ((tmp >> 4) & 268435455); if (length == 8) goto bail; bail: return ((length * 28) + 7) / 8; } -static uint32_t -pack29_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack29_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (29 - 26); + tmp = (*(in + 1) - base) >> (29 - 26); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (29 - 23); + tmp = (*(in + 2) - base) >> (29 - 23); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 23; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (29 - 20); + tmp = (*(in + 3) - base) >> (29 - 20); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (29 - 17); + tmp = (*(in + 4) - base) >> (29 - 17); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 17; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (29 - 14); + tmp = (*(in + 5) - base) >> (29 - 14); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 14; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (29 - 11); + tmp = (*(in + 6) - base) 
>> (29 - 11); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 11; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (29 - 8); + tmp = (*(in + 7) - base) >> (29 - 8); if (length == 8) goto bail; bail: remaining = (((length * 29) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 29) + 7) / 8; } -static uint32_t -unpack29_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack29_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 536870911); + *(out + 0) = base + ((tmp >> 0) & 536870911); if (length == 1) goto bail; - *(out + 1) = tmp >> 29; + *(out + 1) = tmp >> 29; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 26)) << (29 - 26); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 26; + *(out + 2) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 23)) << (29 - 23); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 23; + *(out + 3) = tmp >> 23; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 20)) << (29 - 20); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 20; + *(out + 4) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 17)) << (29 - 17); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 17; + *(out + 5) = tmp >> 17; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 14)) << (29 - 14); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 14; + *(out + 6) = tmp >> 14; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 11)) << (29 - 11); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 11; + 
*(out + 7) = tmp >> 11; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 8)) << (29 - 8); @@ -14292,115 +14048,116 @@ unpack29_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 29) + 7) / 8; } -static uint32_t -pack30_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack30_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (30 - 28); + tmp = (*(in + 1) - base) >> (30 - 28); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (30 - 26); + tmp = (*(in + 2) - base) >> (30 - 26); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (30 - 24); + tmp = (*(in + 3) - base) >> (30 - 24); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 24; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (30 - 22); + tmp = (*(in + 4) - base) >> (30 - 22); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 22; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (30 - 20); + tmp = (*(in + 5) - base) >> (30 - 20); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 20; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (30 - 18); + tmp = (*(in + 6) - base) >> (30 - 18); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 18; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (30 - 16); + tmp = (*(in + 7) - base) >> (30 - 16); if (length == 8) goto bail; bail: remaining = (((length * 30) + 7) / 8) % 4; - if 
(remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 30) + 7) / 8; } -static uint32_t -unpack30_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack30_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1073741823); + *(out + 0) = base + ((tmp >> 0) & 1073741823); if (length == 1) goto bail; - *(out + 1) = tmp >> 30; + *(out + 1) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 28)) << (30 - 28); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 28; + *(out + 2) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 26)) << (30 - 26); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 26; + *(out + 3) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 24)) << (30 - 24); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 24; + *(out + 4) = tmp >> 24; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 22)) << (30 - 22); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 22; + *(out + 5) = tmp >> 22; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 20)) << (30 - 20); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 20; + *(out + 6) = tmp >> 20; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 18)) << (30 - 18); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 18; + *(out + 7) = tmp >> 18; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 16)) << (30 - 16); @@ -14411,115 +14168,116 @@ unpack30_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 30) + 7) / 8; } -static uint32_t 
-pack31_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack31_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if (length == 1) goto bail; tmp |= (*(in + 1) - base) << 31; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (31 - 30); + tmp = (*(in + 1) - base) >> (31 - 30); if (length == 2) goto bail; tmp |= (*(in + 2) - base) << 30; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (31 - 29); + tmp = (*(in + 2) - base) >> (31 - 29); if (length == 3) goto bail; tmp |= (*(in + 3) - base) << 29; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (31 - 28); + tmp = (*(in + 3) - base) >> (31 - 28); if (length == 4) goto bail; tmp |= (*(in + 4) - base) << 28; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (31 - 27); + tmp = (*(in + 4) - base) >> (31 - 27); if (length == 5) goto bail; tmp |= (*(in + 5) - base) << 27; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (31 - 26); + tmp = (*(in + 5) - base) >> (31 - 26); if (length == 6) goto bail; tmp |= (*(in + 6) - base) << 26; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (31 - 25); + tmp = (*(in + 6) - base) >> (31 - 25); if (length == 7) goto bail; tmp |= (*(in + 7) - base) << 25; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (31 - 24); + tmp = (*(in + 7) - base) >> (31 - 24); if (length == 8) goto bail; bail: remaining = (((length * 31) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 31) + 7) / 8; } -static uint32_t -unpack31_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack31_x(uint32_t base, const 
uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2147483647); + *(out + 0) = base + ((tmp >> 0) & 2147483647); if (length == 1) goto bail; - *(out + 1) = tmp >> 31; + *(out + 1) = tmp >> 31; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 1) |= (tmp % (1U << 30)) << (31 - 30); *(out + 1) += base; if (length == 2) goto bail; - *(out + 2) = tmp >> 30; + *(out + 2) = tmp >> 30; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 2) |= (tmp % (1U << 29)) << (31 - 29); *(out + 2) += base; if (length == 3) goto bail; - *(out + 3) = tmp >> 29; + *(out + 3) = tmp >> 29; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 3) |= (tmp % (1U << 28)) << (31 - 28); *(out + 3) += base; if (length == 4) goto bail; - *(out + 4) = tmp >> 28; + *(out + 4) = tmp >> 28; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 4) |= (tmp % (1U << 27)) << (31 - 27); *(out + 4) += base; if (length == 5) goto bail; - *(out + 5) = tmp >> 27; + *(out + 5) = tmp >> 27; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 5) |= (tmp % (1U << 26)) << (31 - 26); *(out + 5) += base; if (length == 6) goto bail; - *(out + 6) = tmp >> 26; + *(out + 6) = tmp >> 26; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 6) |= (tmp % (1U << 25)) << (31 - 25); *(out + 6) += base; if (length == 7) goto bail; - *(out + 7) = tmp >> 25; + *(out + 7) = tmp >> 25; in += sizeof(uint32_t); tmp = *(uint32_t *)in; *(out + 7) |= (tmp % (1U << 24)) << (31 - 24); @@ -14530,98 +14288,99 @@ unpack31_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { return ((length * 31) + 7) / 8; } -static uint32_t -pack32_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { +static uint32_t pack32_x(uint32_t base, const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t tmp, remaining; if (length == 0) return 0; - tmp = (*(in + 0) - base) << 0; + tmp = (*(in + 0) - base) << 0; if 
(length == 1) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 1) - base) << 0; + tmp = (*(in + 1) - base) << 0; if (length == 2) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 2) - base) << 0; + tmp = (*(in + 2) - base) << 0; if (length == 3) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 3) - base) << 0; + tmp = (*(in + 3) - base) << 0; if (length == 4) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; + tmp = (*(in + 4) - base) << 0; if (length == 5) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 5) - base) << 0; + tmp = (*(in + 5) - base) << 0; if (length == 6) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 6) - base) << 0; + tmp = (*(in + 6) - base) << 0; if (length == 7) goto bail; *(uint32_t *)out = tmp; out += sizeof(uint32_t); - tmp = (*(in + 7) - base) << 0; + tmp = (*(in + 7) - base) << 0; if (length == 8) goto bail; bail: remaining = (((length * 32) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; + if (remaining == 0) + remaining = 4; memcpy(out, &tmp, remaining); return ((length * 32) + 7) / 8; } -static uint32_t -unpack32_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { +static uint32_t unpack32_x(uint32_t base, const uint8_t *in, uint32_t *out, + uint32_t length) { uint32_t tmp; if (length == 0) return 0; tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4294967295); + *(out + 0) = base + ((tmp >> 0) & 4294967295); if (length == 1) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 1) = base + ((tmp >> 0) & 4294967295); + *(out + 1) = base + ((tmp >> 0) & 4294967295); if (length == 2) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 2) = base + ((tmp >> 0) & 4294967295); + *(out + 2) = base + ((tmp >> 0) & 4294967295); if (length == 3) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - 
*(out + 3) = base + ((tmp >> 0) & 4294967295); + *(out + 3) = base + ((tmp >> 0) & 4294967295); if (length == 4) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 4294967295); + *(out + 4) = base + ((tmp >> 0) & 4294967295); if (length == 5) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 5) = base + ((tmp >> 0) & 4294967295); + *(out + 5) = base + ((tmp >> 0) & 4294967295); if (length == 6) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 6) = base + ((tmp >> 0) & 4294967295); + *(out + 6) = base + ((tmp >> 0) & 4294967295); if (length == 7) goto bail; in += sizeof(uint32_t); tmp = *(uint32_t *)in; - *(out + 7) = base + ((tmp >> 0) & 4294967295); + *(out + 7) = base + ((tmp >> 0) & 4294967295); if (length == 8) goto bail; bail: @@ -14629,79 +14388,22 @@ unpack32_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { } for_packxfunc_t for_packx[33] = { - pack0_x, - pack1_x, - pack2_x, - pack3_x, - pack4_x, - pack5_x, - pack6_x, - pack7_x, - pack8_x, - pack9_x, - pack10_x, - pack11_x, - pack12_x, - pack13_x, - pack14_x, - pack15_x, - pack16_x, - pack17_x, - pack18_x, - pack19_x, - pack20_x, - pack21_x, - pack22_x, - pack23_x, - pack24_x, - pack25_x, - pack26_x, - pack27_x, - pack28_x, - pack29_x, - pack30_x, - pack31_x, - pack32_x -}; + pack0_x, pack1_x, pack2_x, pack3_x, pack4_x, pack5_x, pack6_x, + pack7_x, pack8_x, pack9_x, pack10_x, pack11_x, pack12_x, pack13_x, + pack14_x, pack15_x, pack16_x, pack17_x, pack18_x, pack19_x, pack20_x, + pack21_x, pack22_x, pack23_x, pack24_x, pack25_x, pack26_x, pack27_x, + pack28_x, pack29_x, pack30_x, pack31_x, pack32_x}; for_unpackxfunc_t for_unpackx[33] = { - unpack0_x, - unpack1_x, - unpack2_x, - unpack3_x, - unpack4_x, - unpack5_x, - unpack6_x, - unpack7_x, - unpack8_x, - unpack9_x, - unpack10_x, - unpack11_x, - unpack12_x, - unpack13_x, - unpack14_x, - unpack15_x, - unpack16_x, - unpack17_x, - unpack18_x, - unpack19_x, - 
unpack20_x, - unpack21_x, - unpack22_x, - unpack23_x, - unpack24_x, - unpack25_x, - unpack26_x, - unpack27_x, - unpack28_x, - unpack29_x, - unpack30_x, - unpack31_x, - unpack32_x -}; - -static uint32_t -linsearch1_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { + unpack0_x, unpack1_x, unpack2_x, unpack3_x, unpack4_x, unpack5_x, + unpack6_x, unpack7_x, unpack8_x, unpack9_x, unpack10_x, unpack11_x, + unpack12_x, unpack13_x, unpack14_x, unpack15_x, unpack16_x, unpack17_x, + unpack18_x, unpack19_x, unpack20_x, unpack21_x, unpack22_x, unpack23_x, + unpack24_x, unpack25_x, unpack26_x, unpack27_x, unpack28_x, unpack29_x, + unpack30_x, unpack31_x, unpack32_x}; + +static uint32_t linsearch1_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -14839,8 +14541,8 @@ linsearch1_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (4); } -static uint32_t -linsearch2_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch2_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -14981,8 +14683,8 @@ linsearch2_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (8); } -static uint32_t -linsearch3_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch3_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15128,8 +14830,8 @@ linsearch3_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (12); } -static uint32_t -linsearch4_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch4_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15276,8 +14978,8 @@ linsearch4_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { 
return (16); } -static uint32_t -linsearch5_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch5_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15431,8 +15133,8 @@ linsearch5_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (20); } -static uint32_t -linsearch6_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch6_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15589,8 +15291,8 @@ linsearch6_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (24); } -static uint32_t -linsearch7_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch7_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15752,8 +15454,8 @@ linsearch7_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (28); } -static uint32_t -linsearch8_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch8_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -15912,8 +15614,8 @@ linsearch8_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (32); } -static uint32_t -linsearch9_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch9_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16083,8 +15785,8 @@ linsearch9_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (36); } -static uint32_t -linsearch10_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch10_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { 
uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16257,8 +15959,8 @@ linsearch10_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (40); } -static uint32_t -linsearch11_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch11_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16436,8 +16138,8 @@ linsearch11_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (44); } -static uint32_t -linsearch12_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch12_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16616,8 +16318,8 @@ linsearch12_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (48); } -static uint32_t -linsearch13_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch13_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16803,8 +16505,8 @@ linsearch13_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (52); } -static uint32_t -linsearch14_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch14_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -16993,8 +16695,8 @@ linsearch14_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (56); } -static uint32_t -linsearch15_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch15_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -17188,8 +16890,8 @@ linsearch15_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (60); } -static uint32_t 
-linsearch16_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch16_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -17372,8 +17074,8 @@ linsearch16_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (64); } -static uint32_t -linsearch17_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch17_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -17575,8 +17277,8 @@ linsearch17_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (68); } -static uint32_t -linsearch18_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch18_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -17781,8 +17483,8 @@ linsearch18_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (72); } -static uint32_t -linsearch19_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch19_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -17992,8 +17694,8 @@ linsearch19_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (76); } -static uint32_t -linsearch20_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch20_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -18204,8 +17906,8 @@ linsearch20_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (80); } -static uint32_t -linsearch21_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch21_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; 
value -= base; (void)tmp2; @@ -18423,8 +18125,8 @@ linsearch21_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (84); } -static uint32_t -linsearch22_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch22_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -18645,8 +18347,8 @@ linsearch22_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (88); } -static uint32_t -linsearch23_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch23_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -18872,8 +18574,8 @@ linsearch23_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (92); } -static uint32_t -linsearch24_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch24_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -19096,8 +18798,8 @@ linsearch24_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (96); } -static uint32_t -linsearch25_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch25_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -19331,8 +19033,8 @@ linsearch25_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (100); } -static uint32_t -linsearch26_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch26_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -19569,8 +19271,8 @@ linsearch26_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (104); } -static uint32_t -linsearch27_32(uint32_t base, 
const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch27_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -19812,8 +19514,8 @@ linsearch27_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (108); } -static uint32_t -linsearch28_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch28_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -20056,8 +19758,8 @@ linsearch28_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (112); } -static uint32_t -linsearch29_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch29_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -20307,8 +20009,8 @@ linsearch29_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (116); } -static uint32_t -linsearch30_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch30_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -20561,8 +20263,8 @@ linsearch30_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (120); } -static uint32_t -linsearch31_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch31_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -20820,8 +20522,8 @@ linsearch31_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (124); } -static uint32_t -linsearch32_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch32_32(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t i; uint32_t *in32 = (uint32_t *)in; 
value -= base; @@ -20835,43 +20537,18 @@ linsearch32_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { } for_linsearchfunc_t for_linsearch32[33] = { - linsearch0_n, - linsearch1_32, - linsearch2_32, - linsearch3_32, - linsearch4_32, - linsearch5_32, - linsearch6_32, - linsearch7_32, - linsearch8_32, - linsearch9_32, - linsearch10_32, - linsearch11_32, - linsearch12_32, - linsearch13_32, - linsearch14_32, - linsearch15_32, - linsearch16_32, - linsearch17_32, - linsearch18_32, - linsearch19_32, - linsearch20_32, - linsearch21_32, - linsearch22_32, - linsearch23_32, - linsearch24_32, - linsearch25_32, - linsearch26_32, - linsearch27_32, - linsearch28_32, - linsearch29_32, - linsearch30_32, - linsearch31_32, - linsearch32_32 -}; - -static uint32_t -linsearch1_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { + linsearch0_n, linsearch1_32, linsearch2_32, linsearch3_32, + linsearch4_32, linsearch5_32, linsearch6_32, linsearch7_32, + linsearch8_32, linsearch9_32, linsearch10_32, linsearch11_32, + linsearch12_32, linsearch13_32, linsearch14_32, linsearch15_32, + linsearch16_32, linsearch17_32, linsearch18_32, linsearch19_32, + linsearch20_32, linsearch21_32, linsearch22_32, linsearch23_32, + linsearch24_32, linsearch25_32, linsearch26_32, linsearch27_32, + linsearch28_32, linsearch29_32, linsearch30_32, linsearch31_32, + linsearch32_32}; + +static uint32_t linsearch1_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -20945,8 +20622,8 @@ linsearch1_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (2); } -static uint32_t -linsearch2_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch2_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21020,8 +20697,8 @@ linsearch2_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return 
(4); } -static uint32_t -linsearch3_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch3_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21099,8 +20776,8 @@ linsearch3_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (6); } -static uint32_t -linsearch4_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch4_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21177,8 +20854,8 @@ linsearch4_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (8); } -static uint32_t -linsearch5_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch5_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21260,8 +20937,8 @@ linsearch5_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (10); } -static uint32_t -linsearch6_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch6_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21343,8 +21020,8 @@ linsearch6_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (12); } -static uint32_t -linsearch7_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch7_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21430,8 +21107,8 @@ linsearch7_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (14); } -static uint32_t -linsearch8_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch8_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, 
tmp2; value -= base; (void)tmp2; @@ -21514,8 +21191,8 @@ linsearch8_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (16); } -static uint32_t -linsearch9_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch9_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21605,8 +21282,8 @@ linsearch9_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (18); } -static uint32_t -linsearch10_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch10_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21696,8 +21373,8 @@ linsearch10_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (20); } -static uint32_t -linsearch11_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch11_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21791,8 +21468,8 @@ linsearch11_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (22); } -static uint32_t -linsearch12_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch12_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21885,8 +21562,8 @@ linsearch12_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (24); } -static uint32_t -linsearch13_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch13_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -21984,8 +21661,8 @@ linsearch13_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (26); } -static uint32_t -linsearch14_16(uint32_t base, 
const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch14_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22083,8 +21760,8 @@ linsearch14_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (28); } -static uint32_t -linsearch15_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch15_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22186,8 +21863,8 @@ linsearch15_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (30); } -static uint32_t -linsearch16_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch16_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22282,8 +21959,8 @@ linsearch16_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (32); } -static uint32_t -linsearch17_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch17_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22389,8 +22066,8 @@ linsearch17_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (34); } -static uint32_t -linsearch18_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch18_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22496,8 +22173,8 @@ linsearch18_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (36); } -static uint32_t -linsearch19_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch19_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ 
-22607,8 +22284,8 @@ linsearch19_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (38); } -static uint32_t -linsearch20_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch20_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22717,8 +22394,8 @@ linsearch20_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (40); } -static uint32_t -linsearch21_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch21_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22832,8 +22509,8 @@ linsearch21_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (42); } -static uint32_t -linsearch22_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch22_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -22947,8 +22624,8 @@ linsearch22_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (44); } -static uint32_t -linsearch23_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch23_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23066,8 +22743,8 @@ linsearch23_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (46); } -static uint32_t -linsearch24_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch24_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23182,8 +22859,8 @@ linsearch24_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (48); } -static uint32_t -linsearch25_16(uint32_t base, const uint8_t *in, uint32_t value, 
int *found) { +static uint32_t linsearch25_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23305,8 +22982,8 @@ linsearch25_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (50); } -static uint32_t -linsearch26_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch26_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23428,8 +23105,8 @@ linsearch26_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (52); } -static uint32_t -linsearch27_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch27_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23555,8 +23232,8 @@ linsearch27_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (54); } -static uint32_t -linsearch28_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch28_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23681,8 +23358,8 @@ linsearch28_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (56); } -static uint32_t -linsearch29_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch29_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23812,8 +23489,8 @@ linsearch29_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (58); } -static uint32_t -linsearch30_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch30_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -23943,8 +23620,8 @@ 
linsearch30_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (60); } -static uint32_t -linsearch31_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch31_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24078,8 +23755,8 @@ linsearch31_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (62); } -static uint32_t -linsearch32_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch32_16(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t i; uint32_t *in32 = (uint32_t *)in; value -= base; @@ -24093,43 +23770,18 @@ linsearch32_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { } for_linsearchfunc_t for_linsearch16[33] = { - linsearch0_n, - linsearch1_16, - linsearch2_16, - linsearch3_16, - linsearch4_16, - linsearch5_16, - linsearch6_16, - linsearch7_16, - linsearch8_16, - linsearch9_16, - linsearch10_16, - linsearch11_16, - linsearch12_16, - linsearch13_16, - linsearch14_16, - linsearch15_16, - linsearch16_16, - linsearch17_16, - linsearch18_16, - linsearch19_16, - linsearch20_16, - linsearch21_16, - linsearch22_16, - linsearch23_16, - linsearch24_16, - linsearch25_16, - linsearch26_16, - linsearch27_16, - linsearch28_16, - linsearch29_16, - linsearch30_16, - linsearch31_16, - linsearch32_16 -}; - -static uint32_t -linsearch1_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { + linsearch0_n, linsearch1_16, linsearch2_16, linsearch3_16, + linsearch4_16, linsearch5_16, linsearch6_16, linsearch7_16, + linsearch8_16, linsearch9_16, linsearch10_16, linsearch11_16, + linsearch12_16, linsearch13_16, linsearch14_16, linsearch15_16, + linsearch16_16, linsearch17_16, linsearch18_16, linsearch19_16, + linsearch20_16, linsearch21_16, linsearch22_16, linsearch23_16, + linsearch24_16, linsearch25_16, linsearch26_16, linsearch27_16, + 
linsearch28_16, linsearch29_16, linsearch30_16, linsearch31_16, + linsearch32_16}; + +static uint32_t linsearch1_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24171,8 +23823,8 @@ linsearch1_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (1); } -static uint32_t -linsearch2_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch2_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24214,8 +23866,8 @@ linsearch2_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (2); } -static uint32_t -linsearch3_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch3_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24257,8 +23909,8 @@ linsearch3_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (3); } -static uint32_t -linsearch4_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch4_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24300,8 +23952,8 @@ linsearch4_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (4); } -static uint32_t -linsearch5_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch5_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24347,8 +23999,8 @@ linsearch5_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (5); } -static uint32_t -linsearch6_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch6_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ 
-24394,8 +24046,8 @@ linsearch6_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (6); } -static uint32_t -linsearch7_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch7_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24441,8 +24093,8 @@ linsearch7_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (7); } -static uint32_t -linsearch8_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch8_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24487,8 +24139,8 @@ linsearch8_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (8); } -static uint32_t -linsearch9_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch9_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24538,8 +24190,8 @@ linsearch9_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (9); } -static uint32_t -linsearch10_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch10_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24589,8 +24241,8 @@ linsearch10_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (10); } -static uint32_t -linsearch11_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch11_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24640,8 +24292,8 @@ linsearch11_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (11); } -static uint32_t -linsearch12_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t 
linsearch12_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24691,8 +24343,8 @@ linsearch12_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (12); } -static uint32_t -linsearch13_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch13_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24746,8 +24398,8 @@ linsearch13_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (13); } -static uint32_t -linsearch14_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch14_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24801,8 +24453,8 @@ linsearch14_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (14); } -static uint32_t -linsearch15_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch15_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24856,8 +24508,8 @@ linsearch15_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (15); } -static uint32_t -linsearch16_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch16_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24908,8 +24560,8 @@ linsearch16_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (16); } -static uint32_t -linsearch17_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch17_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -24967,8 +24619,8 @@ linsearch17_8(uint32_t base, const uint8_t *in, uint32_t value, 
int *found) { return (17); } -static uint32_t -linsearch18_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch18_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25026,8 +24678,8 @@ linsearch18_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (18); } -static uint32_t -linsearch19_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch19_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25085,8 +24737,8 @@ linsearch19_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (19); } -static uint32_t -linsearch20_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch20_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25144,8 +24796,8 @@ linsearch20_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (20); } -static uint32_t -linsearch21_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch21_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25207,8 +24859,8 @@ linsearch21_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (21); } -static uint32_t -linsearch22_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch22_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25270,8 +24922,8 @@ linsearch22_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (22); } -static uint32_t -linsearch23_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch23_8(uint32_t base, const uint8_t *in, uint32_t value, + int 
*found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25333,8 +24985,8 @@ linsearch23_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (23); } -static uint32_t -linsearch24_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch24_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25395,8 +25047,8 @@ linsearch24_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (24); } -static uint32_t -linsearch25_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch25_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25462,8 +25114,8 @@ linsearch25_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (25); } -static uint32_t -linsearch26_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch26_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25529,8 +25181,8 @@ linsearch26_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (26); } -static uint32_t -linsearch27_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch27_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25596,8 +25248,8 @@ linsearch27_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (27); } -static uint32_t -linsearch28_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch28_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25663,8 +25315,8 @@ linsearch28_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (28); } -static uint32_t -linsearch29_8(uint32_t 
base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch29_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25734,8 +25386,8 @@ linsearch29_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (29); } -static uint32_t -linsearch30_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch30_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25805,8 +25457,8 @@ linsearch30_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (30); } -static uint32_t -linsearch31_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch31_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t tmp, tmp2; value -= base; (void)tmp2; @@ -25876,8 +25528,8 @@ linsearch31_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { return (31); } -static uint32_t -linsearch32_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { +static uint32_t linsearch32_8(uint32_t base, const uint8_t *in, uint32_t value, + int *found) { uint32_t i; uint32_t *in32 = (uint32_t *)in; value -= base; @@ -25891,43 +25543,16 @@ linsearch32_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { } for_linsearchfunc_t for_linsearch8[33] = { - linsearch0_n, - linsearch1_8, - linsearch2_8, - linsearch3_8, - linsearch4_8, - linsearch5_8, - linsearch6_8, - linsearch7_8, - linsearch8_8, - linsearch9_8, - linsearch10_8, - linsearch11_8, - linsearch12_8, - linsearch13_8, - linsearch14_8, - linsearch15_8, - linsearch16_8, - linsearch17_8, - linsearch18_8, - linsearch19_8, - linsearch20_8, - linsearch21_8, - linsearch22_8, - linsearch23_8, - linsearch24_8, - linsearch25_8, - linsearch26_8, - linsearch27_8, - linsearch28_8, - linsearch29_8, - linsearch30_8, - linsearch31_8, - linsearch32_8 -}; - -static 
uint32_t -linsearch1_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { + linsearch0_n, linsearch1_8, linsearch2_8, linsearch3_8, linsearch4_8, + linsearch5_8, linsearch6_8, linsearch7_8, linsearch8_8, linsearch9_8, + linsearch10_8, linsearch11_8, linsearch12_8, linsearch13_8, linsearch14_8, + linsearch15_8, linsearch16_8, linsearch17_8, linsearch18_8, linsearch19_8, + linsearch20_8, linsearch21_8, linsearch22_8, linsearch23_8, linsearch24_8, + linsearch25_8, linsearch26_8, linsearch27_8, linsearch28_8, linsearch29_8, + linsearch30_8, linsearch31_8, linsearch32_8}; + +static uint32_t linsearch1_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -25986,8 +25611,8 @@ linsearch1_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 1) + 7) / 8; } -static uint32_t -linsearch2_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch2_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26046,8 +25671,8 @@ linsearch2_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 2) + 7) / 8; } -static uint32_t -linsearch3_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch3_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26106,8 +25731,8 @@ linsearch3_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 3) + 7) / 8; } -static uint32_t -linsearch4_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch4_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 
0) @@ -26166,8 +25791,8 @@ linsearch4_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 4) + 7) / 8; } -static uint32_t -linsearch5_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch5_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26229,8 +25854,8 @@ linsearch5_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 5) + 7) / 8; } -static uint32_t -linsearch6_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch6_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26292,8 +25917,8 @@ linsearch6_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 6) + 7) / 8; } -static uint32_t -linsearch7_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch7_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26355,8 +25980,8 @@ linsearch7_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 7) + 7) / 8; } -static uint32_t -linsearch8_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch8_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26417,8 +26042,8 @@ linsearch8_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 8) + 7) / 8; } -static uint32_t -linsearch9_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch9_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int 
*found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26483,8 +26108,8 @@ linsearch9_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 9) + 7) / 8; } -static uint32_t -linsearch10_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch10_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26549,8 +26174,8 @@ linsearch10_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 10) + 7) / 8; } -static uint32_t -linsearch11_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch11_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26615,8 +26240,8 @@ linsearch11_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 11) + 7) / 8; } -static uint32_t -linsearch12_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch12_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26681,8 +26306,8 @@ linsearch12_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 12) + 7) / 8; } -static uint32_t -linsearch13_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch13_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26750,8 +26375,8 @@ linsearch13_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 13) + 7) / 8; } -static uint32_t -linsearch14_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t 
linsearch14_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26819,8 +26444,8 @@ linsearch14_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 14) + 7) / 8; } -static uint32_t -linsearch15_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch15_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26888,8 +26513,8 @@ linsearch15_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 15) + 7) / 8; } -static uint32_t -linsearch16_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch16_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -26954,8 +26579,8 @@ linsearch16_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 16) + 7) / 8; } -static uint32_t -linsearch17_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch17_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27026,8 +26651,8 @@ linsearch17_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 17) + 7) / 8; } -static uint32_t -linsearch18_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch18_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27098,8 +26723,8 @@ linsearch18_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 18) + 7) / 8; } -static uint32_t -linsearch19_x(uint32_t base, const 
uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch19_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27170,8 +26795,8 @@ linsearch19_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 19) + 7) / 8; } -static uint32_t -linsearch20_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch20_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27242,8 +26867,8 @@ linsearch20_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 20) + 7) / 8; } -static uint32_t -linsearch21_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch21_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27317,8 +26942,8 @@ linsearch21_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 21) + 7) / 8; } -static uint32_t -linsearch22_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch22_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27392,8 +27017,8 @@ linsearch22_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 22) + 7) / 8; } -static uint32_t -linsearch23_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch23_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27467,8 +27092,8 @@ linsearch23_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return 
((length * 23) + 7) / 8; } -static uint32_t -linsearch24_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch24_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27541,8 +27166,8 @@ linsearch24_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 24) + 7) / 8; } -static uint32_t -linsearch25_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch25_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27619,8 +27244,8 @@ linsearch25_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 25) + 7) / 8; } -static uint32_t -linsearch26_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch26_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27697,8 +27322,8 @@ linsearch26_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 26) + 7) / 8; } -static uint32_t -linsearch27_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch27_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27775,8 +27400,8 @@ linsearch27_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 27) + 7) / 8; } -static uint32_t -linsearch28_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch28_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27853,8 +27478,8 @@ 
linsearch28_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 28) + 7) / 8; } -static uint32_t -linsearch29_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch29_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -27934,8 +27559,8 @@ linsearch29_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 29) + 7) / 8; } -static uint32_t -linsearch30_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch30_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -28015,8 +27640,8 @@ linsearch30_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 30) + 7) / 8; } -static uint32_t -linsearch31_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch31_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -28096,8 +27721,8 @@ linsearch31_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, return ((length * 31) + 7) / 8; } -static uint32_t -linsearch32_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { +static uint32_t linsearch32_x(uint32_t base, const uint8_t *in, uint32_t length, + uint32_t value, int *found) { uint32_t tmp, tmp2; (void)tmp2; if (length == 0) @@ -28171,38 +27796,10 @@ linsearch32_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, } for_linsearchxfunc_t for_linsearchx[33] = { - linsearch0_x, - linsearch1_x, - linsearch2_x, - linsearch3_x, - linsearch4_x, - linsearch5_x, - linsearch6_x, - linsearch7_x, - linsearch8_x, - linsearch9_x, - linsearch10_x, - linsearch11_x, - linsearch12_x, 
- linsearch13_x, - linsearch14_x, - linsearch15_x, - linsearch16_x, - linsearch17_x, - linsearch18_x, - linsearch19_x, - linsearch20_x, - linsearch21_x, - linsearch22_x, - linsearch23_x, - linsearch24_x, - linsearch25_x, - linsearch26_x, - linsearch27_x, - linsearch28_x, - linsearch29_x, - linsearch30_x, - linsearch31_x, - linsearch32_x -}; - + linsearch0_x, linsearch1_x, linsearch2_x, linsearch3_x, linsearch4_x, + linsearch5_x, linsearch6_x, linsearch7_x, linsearch8_x, linsearch9_x, + linsearch10_x, linsearch11_x, linsearch12_x, linsearch13_x, linsearch14_x, + linsearch15_x, linsearch16_x, linsearch17_x, linsearch18_x, linsearch19_x, + linsearch20_x, linsearch21_x, linsearch22_x, linsearch23_x, linsearch24_x, + linsearch25_x, linsearch26_x, linsearch27_x, linsearch28_x, linsearch29_x, + linsearch30_x, linsearch31_x, linsearch32_x}; diff --git a/src/for.c b/src/for.c index 0ca9dbe..ab1187a 100644 --- a/src/for.c +++ b/src/for.c @@ -24,14 +24,14 @@ typedef unsigned int uint32_t; typedef unsigned char uint8_t; typedef signed char int8_t; #else -# include +#include #endif -#define METADATA 5 /* size of metadata overhead */ +#define METADATA 5 /* size of metadata overhead */ #ifdef _MSC_VER -# define INLINE __inline -# include +#define INLINE __inline +#include uint32_t __inline CLZ(uint32_t value) { uint32_t leading_zero = 0; @@ -39,33 +39,29 @@ uint32_t __inline CLZ(uint32_t value) { return 31 - leading_zero; } #else -# define INLINE inline -# define CLZ __builtin_clz +#define INLINE inline +#define CLZ __builtin_clz #endif -typedef uint32_t(*for_unpackfunc_t) (uint32_t, const uint8_t *, uint32_t *); -typedef uint32_t(*for_packfunc_t) (uint32_t, const uint32_t *, uint8_t *); -typedef uint32_t(*for_unpackxfunc_t) (uint32_t, const uint8_t *, uint32_t *, - uint32_t); -typedef uint32_t(*for_packxfunc_t) (uint32_t, const uint32_t *, uint8_t *, - uint32_t); -typedef uint32_t(*for_linsearchfunc_t)(uint32_t, const uint8_t *, uint32_t, - int *); -typedef 
uint32_t(*for_linsearchxfunc_t)(uint32_t, const uint8_t *, uint32_t, - uint32_t, int *); +typedef uint32_t (*for_unpackfunc_t)(uint32_t, const uint8_t *, uint32_t *); +typedef uint32_t (*for_packfunc_t)(uint32_t, const uint32_t *, uint8_t *); +typedef uint32_t (*for_unpackxfunc_t)(uint32_t, const uint8_t *, uint32_t *, + uint32_t); +typedef uint32_t (*for_packxfunc_t)(uint32_t, const uint32_t *, uint8_t *, + uint32_t); +typedef uint32_t (*for_linsearchfunc_t)(uint32_t, const uint8_t *, uint32_t, + int *); +typedef uint32_t (*for_linsearchxfunc_t)(uint32_t, const uint8_t *, uint32_t, + uint32_t, int *); /* include the generated file */ #include "for-gen.c" -static INLINE uint32_t -required_bits(const uint32_t v) -{ +static INLINE uint32_t required_bits(const uint32_t v) { return v == 0 ? 0 : 32 - CLZ(v); } -uint32_t -for_compressed_size_bits(uint32_t length, uint32_t bits) -{ +uint32_t for_compressed_size_bits(uint32_t length, uint32_t bits) { uint32_t c = 0; uint32_t b; @@ -93,9 +89,7 @@ for_compressed_size_bits(uint32_t length, uint32_t bits) return c + ((length * bits) + 7) / 8; } -uint32_t -for_compressed_size_unsorted(const uint32_t *in, uint32_t length) -{ +uint32_t for_compressed_size_unsorted(const uint32_t *in, uint32_t length) { uint32_t i, b, m, M; if (length == 0) @@ -118,9 +112,7 @@ for_compressed_size_unsorted(const uint32_t *in, uint32_t length) return METADATA + for_compressed_size_bits(length, b); } -uint32_t -for_compressed_size_sorted(const uint32_t *in, uint32_t length) -{ +uint32_t for_compressed_size_sorted(const uint32_t *in, uint32_t length) { uint32_t b, m, M; if (length == 0) @@ -136,10 +128,8 @@ for_compressed_size_sorted(const uint32_t *in, uint32_t length) return METADATA + for_compressed_size_bits(length, b); } -uint32_t -for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, - uint32_t base, uint32_t bits) -{ +uint32_t for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, + uint32_t base, uint32_t bits) { 
uint32_t i = 0; uint32_t written = 0; @@ -157,9 +147,8 @@ for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, return written + for_packx[bits](base, in, out + written, length - i); } -uint32_t -for_compress_unsorted(const uint32_t *in, uint8_t *out, uint32_t length) -{ +uint32_t for_compress_unsorted(const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t i, b, m, M; if (length == 0) @@ -181,13 +170,12 @@ for_compress_unsorted(const uint32_t *in, uint8_t *out, uint32_t length) /* store m and the bits */ *(uint32_t *)(out + 0) = m; - *(uint8_t *) (out + 4) = b; + *(uint8_t *)(out + 4) = b; return METADATA + for_compress_bits(in, out + METADATA, length, m, b); } -uint32_t -for_compress_sorted(const uint32_t *in, uint8_t *out, uint32_t length) -{ +uint32_t for_compress_sorted(const uint32_t *in, uint8_t *out, + uint32_t length) { uint32_t m, M, b; if (length == 0) @@ -202,15 +190,13 @@ for_compress_sorted(const uint32_t *in, uint8_t *out, uint32_t length) /* store m and the bits */ *(uint32_t *)(out + 0) = m; - *(uint8_t *) (out + 4) = b; + *(uint8_t *)(out + 4) = b; return METADATA + for_compress_bits(in, out + METADATA, length, m, b); } -uint32_t -for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, - uint32_t base, uint32_t bits) -{ +uint32_t for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, + uint32_t base, uint32_t bits) { uint32_t i = 0; const uint8_t *bin = in; @@ -228,9 +214,7 @@ for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, return (uint32_t)(in - bin) + for_unpackx[bits](base, in, out, length - i); } -uint32_t -for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length) -{ +uint32_t for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length) { uint32_t m, b; if (length == 0) @@ -243,10 +227,8 @@ for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length) return METADATA + for_uncompress_bits(in + METADATA, out, length, m, b); } -uint32_t 
-for_append_bits(uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value) -{ +uint32_t for_append_bits(uint8_t *in, uint32_t length, uint32_t base, + uint32_t bits, uint32_t value) { uint32_t b, start; uint8_t *initin = in; uint32_t *in32 = (uint32_t *)in; @@ -309,12 +291,11 @@ for_append_bits(uint8_t *in, uint32_t length, uint32_t base, return (uint32_t)(in - initin) + ((start + bits) + 7) / 8; } -typedef uint32_t (* append_impl)(const uint32_t *in, uint8_t *out, - uint32_t length); +typedef uint32_t (*append_impl)(const uint32_t *in, uint8_t *out, + uint32_t length); -static uint32_t -for_append_impl(uint8_t *in, uint32_t length, uint32_t value, append_impl impl) -{ +static uint32_t for_append_impl(uint8_t *in, uint32_t length, uint32_t value, + append_impl impl) { uint32_t m, b, bnew, s; if (length == 0) @@ -341,22 +322,16 @@ for_append_impl(uint8_t *in, uint32_t length, uint32_t value, append_impl impl) return METADATA + for_append_bits(in + METADATA, length, m, b, value); } -uint32_t -for_append_unsorted(uint8_t *in, uint32_t length, uint32_t value) -{ +uint32_t for_append_unsorted(uint8_t *in, uint32_t length, uint32_t value) { return for_append_impl(in, length, value, for_compress_unsorted); } -uint32_t -for_append_sorted(uint8_t *in, uint32_t length, uint32_t value) -{ +uint32_t for_append_sorted(uint8_t *in, uint32_t length, uint32_t value) { return for_append_impl(in, length, value, for_compress_sorted); } -uint32_t -for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, - uint32_t index) -{ +uint32_t for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, + uint32_t index) { uint32_t b, start; const uint32_t *in32; @@ -405,14 +380,12 @@ for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, uint32_t mask1 = (1 << bits) - 1; uint32_t mask2 = (1 << (bits - (32 - start))) - 1; uint32_t v1 = (*(in32 + 0) >> start) & mask1; - uint32_t v2 = *(in32 + 1) & mask2; + uint32_t v2 = *(in32 + 1) & mask2; return base + 
((v2 << (32 - start)) | v1); } } -uint32_t -for_select(const uint8_t *in, uint32_t index) -{ +uint32_t for_select(const uint8_t *in, uint32_t index) { /* load min and the bits */ uint32_t m = *(uint32_t *)(in + 0); uint32_t b = *(in + 4); @@ -420,9 +393,7 @@ for_select(const uint8_t *in, uint32_t index) return for_select_bits(in + METADATA, m, b, index); } -uint32_t -for_linear_search(const uint8_t *in, uint32_t length, uint32_t value) -{ +uint32_t for_linear_search(const uint8_t *in, uint32_t length, uint32_t value) { /* load min and the bits */ uint32_t m = *(uint32_t *)(in + 0); uint32_t b = *(in + 4); @@ -430,10 +401,8 @@ for_linear_search(const uint8_t *in, uint32_t length, uint32_t value) return for_linear_search_bits(in + METADATA, length, m, b, value); } -uint32_t -for_linear_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value) -{ +uint32_t for_linear_search_bits(const uint8_t *in, uint32_t length, + uint32_t base, uint32_t bits, uint32_t value) { uint32_t i = 0; int found = -1; @@ -467,23 +436,20 @@ for_linear_search_bits(const uint8_t *in, uint32_t length, uint32_t base, return length; } -uint32_t -for_lower_bound_search(const uint8_t *in, uint32_t length, uint32_t value, - uint32_t *actual) -{ +uint32_t for_lower_bound_search(const uint8_t *in, uint32_t length, + uint32_t value, uint32_t *actual) { /* load min and the bits */ uint32_t m = *(uint32_t *)(in + 0); uint32_t b = *(in + 4); - return for_lower_bound_search_bits(in + METADATA, length, m, b, - value, actual); + return for_lower_bound_search_bits(in + METADATA, length, m, b, value, + actual); } /* adapted from wikipedia */ -uint32_t -for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value, uint32_t *actual) -{ +uint32_t for_lower_bound_search_bits(const uint8_t *in, uint32_t length, + uint32_t base, uint32_t bits, + uint32_t value, uint32_t *actual) { uint32_t imid; uint32_t imin = 0; uint32_t imax = 
length - 1; @@ -495,8 +461,7 @@ for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, v = for_select_bits(in, base, bits, imid); if (v >= value) { imax = imid; - } - else if (v < value) { + } else if (v < value) { imin = imid; } } @@ -511,4 +476,3 @@ for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, *actual = v; return imax; } - diff --git a/src/frameofreference.cpp b/src/frameofreference.cpp index 5a7ae90..64b347f 100644 --- a/src/frameofreference.cpp +++ b/src/frameofreference.cpp @@ -1,27226 +1,28033 @@ #include "frameofreference.h" - -static __m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit) { - int k; - int inwordpointer; - __m128i P; - uint32_t firstpass; - __m128i offset; - if(bit == 0) return out;/* nothing to do */ - if(bit == 32) { - memcpy(out,in,length*sizeof(uint32_t)); - return (__m128i *) ((uint32_t *) out + length) ; +static __m128i *simdpackFOR_length(uint32_t initvalue, const uint32_t *in, + int length, __m128i *out, + const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + __m128i offset; + if (bit == 0) + return out; /* nothing to do */ + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (__m128i *)((uint32_t *)out + length); + } + offset = _mm_set1_epi32(initvalue); + inwordpointer = 0; + P = _mm_setzero_si128(); + for (k = 0; k < length / 4; ++k) { + __m128i value = + _mm_sub_epi32(_mm_loadu_si128(((const __m128i *)in + k)), offset); + P = _mm_or_si128(P, _mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + _mm_storeu_si128(out++, P); + P = _mm_srli_epi32(value, firstpass); + inwordpointer = bit - firstpass; } - offset = _mm_set1_epi32(initvalue); - inwordpointer = 0; - P = _mm_setzero_si128(); - for(k = 0; k < length / 4 ; ++k) { - __m128i value = 
_mm_sub_epi32(_mm_loadu_si128(((const __m128i * ) in + k)),offset); - P = _mm_or_si128(P,_mm_slli_epi32(value, inwordpointer)); - firstpass = sizeof(uint32_t) * 8 - inwordpointer; - if(bit((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack2_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - 
*out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack3_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( 
(*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 3 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 3 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack4_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= 
static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack5_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = 
static_cast( (*in) - base ) >> ( 5 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 5 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 5 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 5 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack6_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - 
++out; - *out = static_cast( (*in) - base ) >> ( 6 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 6 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 6 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 6 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack7_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++in; - *out |= static_cast( (*in) - 
base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 7 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack8_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - 
*out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack9_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = 
static_cast( (*in) - base ) >> ( 9 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 9 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - ++in; - - return out; 
-} - - - -static uint32_t * pack10_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) 
<< 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 10 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack11_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 
; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 11 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack12_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - 
*out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 12 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack13_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out 
|= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 13 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack14_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - 
++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) 
<< 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 14 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack15_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( 
(*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 15 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack16_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; 
- *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack17_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 14 ); - ++in; - *out |= 
static_cast( (*in) - base ) << 14 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 17 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack18_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 
; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= 
static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 18 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack19_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 3 ); - ++in; - *out |= static_cast( (*in) 
- base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 19 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack20_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 12 ); - ++in; - 
*out |= static_cast( (*in) - base ) << 12 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 20 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) 
>> ( 20 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack21_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - 
++out; - *out = static_cast( (*in) - base ) >> ( 21 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 21 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack22_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 16 ); - ++in; - *out |= 
static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) 
- base ) >> ( 22 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 22 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack23_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 7 ); - ++in; - *out |= static_cast( (*in) - 
base ) << 7 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 21 ); - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 23 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack24_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); 
- ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - 
base ) >> ( 24 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 24 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack25_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 23 ); - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base 
) >> ( 25 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 21 ); - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 25 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack26_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> 
( 26 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 
26 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 26 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack27_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) 
<< 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 26 ); - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 21 ); - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 23 ); - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 25 ); - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 15 ); - ++in; - *out |= static_cast( (*in) - 
base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 27 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack28_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - ++in; - *out = static_cast((*in) 
-base) ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 28 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack29_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 26 ); - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 23 ); - ++in; - *out |= static_cast( (*in) - base ) << 
23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 28 ); - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 25 ); - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 1 ); - ++in; - *out |= static_cast( 
(*in) - base ) << 1 ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 27 ); - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 21 ); - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 29 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack30_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 28 ); - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 26 ); - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 18 ); - ++in; - *out |= 
static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++out; - ++in; - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 28 ); - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 26 ); - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; 
- *out = static_cast( (*in) - base ) >> ( 30 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 4 ); - ++in; - *out |= static_cast( (*in) - base ) << 4 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 30 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack31_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = static_cast((*in) -base) ; - ++in; - *out |= static_cast( (*in) - base ) << 31 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 30 ); - ++in; - *out |= static_cast( (*in) - base ) << 30 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 29 ); - ++in; - *out |= static_cast( (*in) - base ) << 29 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 28 ); - ++in; - *out |= static_cast( (*in) - base ) << 28 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 27 ); - ++in; - *out |= static_cast( (*in) - base ) << 27 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 26 ); - ++in; - *out |= static_cast( (*in) - base ) << 26 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 25 ); - ++in; - *out |= static_cast( (*in) - base ) << 25 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 24 ); - ++in; - *out |= static_cast( (*in) - base ) << 24 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 23 ); - ++in; - *out |= static_cast( (*in) - base ) << 23 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 22 ); - ++in; - *out |= static_cast( (*in) - base ) << 22 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 21 
); - ++in; - *out |= static_cast( (*in) - base ) << 21 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 20 ); - ++in; - *out |= static_cast( (*in) - base ) << 20 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 19 ); - ++in; - *out |= static_cast( (*in) - base ) << 19 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 18 ); - ++in; - *out |= static_cast( (*in) - base ) << 18 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 17 ); - ++in; - *out |= static_cast( (*in) - base ) << 17 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 16 ); - ++in; - *out |= static_cast( (*in) - base ) << 16 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 15 ); - ++in; - *out |= static_cast( (*in) - base ) << 15 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 14 ); - ++in; - *out |= static_cast( (*in) - base ) << 14 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 13 ); - ++in; - *out |= static_cast( (*in) - base ) << 13 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 12 ); - ++in; - *out |= static_cast( (*in) - base ) << 12 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 11 ); - ++in; - *out |= static_cast( (*in) - base ) << 11 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 10 ); - ++in; - *out |= static_cast( (*in) - base ) << 10 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 9 ); - ++in; - *out |= static_cast( (*in) - base ) << 9 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 8 ); - ++in; - *out |= static_cast( (*in) - base ) << 8 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 7 ); - ++in; - *out |= static_cast( (*in) - base ) << 7 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 6 ); - ++in; - *out |= static_cast( (*in) - base ) << 6 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 5 ); - ++in; - *out |= static_cast( (*in) - base ) << 5 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 4 ); - ++in; - *out |= static_cast( 
(*in) - base ) << 4 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 3 ); - ++in; - *out |= static_cast( (*in) - base ) << 3 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 2 ); - ++in; - *out |= static_cast( (*in) - base ) << 2 ; - ++out; - *out = static_cast( (*in) - base ) >> ( 31 - 1 ); - ++in; - *out |= static_cast( (*in) - base ) << 1 ; - ++out; - ++in; - - return out; -} - - - -static uint32_t * pack32_32( uint32_t , const uint32_t * in, uint32_t * out) { - memcpy(out,in,32*sizeof(uint32_t)); - return out + 32; -} - - - - -static const uint32_t * unpack1_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 1 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 2 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 3 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 4 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 5 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 6 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 7 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 8 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 9 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 10 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 11 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 12 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 13 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 14 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 15 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 16 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 17 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 18 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 19 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 20 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 21 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 22 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 23 ) & 1 ; - *out += base; - out++; - *out = ( (*in) 
>> 24 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 25 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 26 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 27 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 28 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 29 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 30 ) & 1 ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack2_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 14 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 16 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U 
<< 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 14 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 16 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) % (1U << 2 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack3_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 15 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 21 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 13 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 16 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 19 ) % (1U << 3 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) % (1U << 3 ) ; - *out += base; - out++; - 
/**
 * Decode 32 integers packed at 4 bits each and add `base` to every value.
 *
 * Layout (established by the original unrolled statements): 8 values per
 * 32-bit word, little-endian bit order, 4 words consumed in total.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this loop form is behaviorally identical.
 *
 * @param base  value added to every decoded integer (frame-of-reference base)
 * @param in    source of 4 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 4)
 */
static const uint32_t *unpack4_32(uint32_t base, const uint32_t *in,
                                  uint32_t *out) {
  for (int w = 0; w < 4; ++w, ++in)        /* 8 nibbles per word */
    for (uint32_t s = 0; s < 32; s += 4)
      *out++ = ((*in >> s) & 0xFU) + base; /* & 0xF == % (1U << 4) */
  return in;
}
/**
 * Decode 32 integers packed at 6 bits each and add `base` to every value.
 *
 * Values are packed contiguously, little-endian bit order; a value may
 * straddle a 32-bit word boundary, in which case its high bits come from
 * the low bits of the next word. 32 * 6 bits = 6 words consumed.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this explicit bit-cursor form is behaviorally
 * identical (the spill `v |= *in << (6 - shift)` mirrors the original
 * `((*in) % (1U<<c)) << (6-c)` statements).
 *
 * @param base  value added to every decoded integer
 * @param in    source of 6 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 6)
 */
static const uint32_t *unpack6_32(uint32_t base, const uint32_t *in,
                                  uint32_t *out) {
  const uint32_t mask = (1U << 6) - 1;
  uint32_t shift = 0;
  for (int k = 0; k < 32; ++k) {
    uint32_t v = *in >> shift;
    shift += 6;
    if (shift >= 32) { /* value ended at, or crossed, a word boundary */
      ++in;
      shift -= 32;
      if (shift != 0) /* pull the remaining high bits from the next word */
        v |= *in << (6 - shift);
    }
    *out++ = (v & mask) + base;
  }
  return in;
}
base; - out++; - *out = ( (*in) >> 21 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 17 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 13 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 16 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 23 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 19 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 15 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( (*in) >> 11 ) % (1U << 7 ) ; - *out += base; - out++; - *out = ( 
/**
 * Decode 32 integers packed at 8 bits each and add `base` to every value.
 *
 * Layout (established by the original unrolled statements): 4 bytes per
 * 32-bit word, little-endian bit order, 8 words consumed in total; no
 * value ever straddles a word boundary at this width.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this loop form is behaviorally identical.
 *
 * @param base  value added to every decoded integer
 * @param in    source of 8 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 8)
 */
static const uint32_t *unpack8_32(uint32_t base, const uint32_t *in,
                                  uint32_t *out) {
  for (int w = 0; w < 8; ++w, ++in)         /* 4 bytes per word */
    for (uint32_t s = 0; s < 32; s += 8)
      *out++ = ((*in >> s) & 0xFFU) + base; /* & 0xFF == % (1U << 8) */
  return in;
}
/**
 * Decode 32 integers packed at 10 bits each and add `base` to every value.
 *
 * Values are packed contiguously, little-endian bit order; a value may
 * straddle a 32-bit word boundary, in which case its high bits come from
 * the low bits of the next word. 32 * 10 bits = 10 words consumed.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this explicit bit-cursor form is behaviorally
 * identical (the spill `v |= *in << (10 - shift)` mirrors the original
 * `((*in) % (1U<<c)) << (10-c)` statements).
 *
 * @param base  value added to every decoded integer
 * @param in    source of 10 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 10)
 */
static const uint32_t *unpack10_32(uint32_t base, const uint32_t *in,
                                   uint32_t *out) {
  const uint32_t mask = (1U << 10) - 1;
  uint32_t shift = 0;
  for (int k = 0; k < 32; ++k) {
    uint32_t v = *in >> shift;
    shift += 10;
    if (shift >= 32) { /* value ended at, or crossed, a word boundary */
      ++in;
      shift -= 32;
      if (shift != 0) /* pull the remaining high bits from the next word */
        v |= *in << (10 - shift);
    }
    *out++ = (v & mask) + base;
  }
  return in;
}
/**
 * Decode 32 integers packed at 12 bits each and add `base` to every value.
 *
 * Values are packed contiguously, little-endian bit order; a value may
 * straddle a 32-bit word boundary, in which case its high bits come from
 * the low bits of the next word. 32 * 12 bits = 12 words consumed.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this explicit bit-cursor form is behaviorally
 * identical (the spill `v |= *in << (12 - shift)` mirrors the original
 * `((*in) % (1U<<c)) << (12-c)` statements).
 *
 * @param base  value added to every decoded integer
 * @param in    source of 12 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 12)
 */
static const uint32_t *unpack12_32(uint32_t base, const uint32_t *in,
                                   uint32_t *out) {
  const uint32_t mask = (1U << 12) - 1;
  uint32_t shift = 0;
  for (int k = 0; k < 32; ++k) {
    uint32_t v = *in >> shift;
    shift += 12;
    if (shift >= 32) { /* value ended at, or crossed, a word boundary */
      ++in;
      shift -= 32;
      if (shift != 0) /* pull the remaining high bits from the next word */
        v |= *in << (12 - shift);
    }
    *out++ = (v & mask) + base;
  }
  return in;
}
/**
 * Decode 32 integers packed at 14 bits each and add `base` to every value.
 *
 * Values are packed contiguously, little-endian bit order; a value may
 * straddle a 32-bit word boundary, in which case its high bits come from
 * the low bits of the next word. 32 * 14 bits = 14 words consumed.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this explicit bit-cursor form is behaviorally
 * identical (the spill `v |= *in << (14 - shift)` mirrors the original
 * `((*in) % (1U<<c)) << (14-c)` statements).
 *
 * @param base  value added to every decoded integer
 * @param in    source of 14 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 14)
 */
static const uint32_t *unpack14_32(uint32_t base, const uint32_t *in,
                                   uint32_t *out) {
  const uint32_t mask = (1U << 14) - 1;
  uint32_t shift = 0;
  for (int k = 0; k < 32; ++k) {
    uint32_t v = *in >> shift;
    shift += 14;
    if (shift >= 32) { /* value ended at, or crossed, a word boundary */
      ++in;
      shift -= 32;
      if (shift != 0) /* pull the remaining high bits from the next word */
        v |= *in << (14 - shift);
    }
    *out++ = (v & mask) + base;
  }
  return in;
}
*out += base; - out++; - *out = ( (*in) >> 13 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 16 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 15 ) ; - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 15 ) ; - *out += 
/**
 * Decode 32 integers packed at 16 bits each and add `base` to every value.
 *
 * Layout (established by the original unrolled statements): 2 halfwords per
 * 32-bit word, little-endian bit order, 16 words consumed in total; no
 * value ever straddles a word boundary at this width.
 * NOTE(review): reconstructed from diff-mangled generated code; the original
 * was fully unrolled — this loop form is behaviorally identical.
 *
 * @param base  value added to every decoded integer
 * @param in    source of 16 packed 32-bit words
 * @param out   destination for the 32 decoded values
 * @return      pointer just past the last input word consumed (in + 16)
 */
static const uint32_t *unpack16_32(uint32_t base, const uint32_t *in,
                                   uint32_t *out) {
  for (int w = 0; w < 16; ++w, ++in) { /* low halfword, then high halfword */
    *out++ = (*in & 0xFFFFU) + base;   /* & 0xFFFF == % (1U << 16) */
    *out++ = (*in >> 16) + base;
  }
  return in;
}
out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) % (1U << 17 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack18_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 
12 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - *out += base; - 
out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack19_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 19 ) ; 
- *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 19 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 19 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack20_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 
- 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack21_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - 
*out = ( (*in) >> 0 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 
) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 21 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack22_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; 
- *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack23_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 23 ) ; - *out += 
base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - 
++in; - *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 23 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack24_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - 
out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack25_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out 
|= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) 
% (1U<< 24 ))<<( 25 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 25 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack26_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= 
((*in) % (1U<< 24 ))<<( 26 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * 
unpack27_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 27 ) ; - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 27 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) % (1U << 27 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 27 ) ; - *out += base; - out++; - 
*out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) % (1U << 27 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack28_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) 
; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *out 
+= base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack29_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 29 ) ; - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) % (1U << 29 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); - *out += base; - out++; - 
*out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) % (1U << 29 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack30_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - 
++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) ; - ++in; - *out += base; - out++; - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - *out += base; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) 
; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack31_32( uint32_t base, const uint32_t * in, uint32_t * out) { - - *out = ( (*in) >> 0 ) % (1U << 31 ) ; - *out += base; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - *out += base; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - *out += base; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - *out += base; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - *out += base; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - *out += base; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - *out += base; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - *out += base; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); - *out += base; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); - *out += base; - out++; - *out 
= ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); - *out += base; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); - *out += base; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); - *out += base; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); - *out += base; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); - *out += base; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); - *out += base; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); - *out += base; - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); - *out += base; - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); - *out += base; - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); - *out += base; - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); - *out += base; - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); - *out += base; - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); - *out += base; - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); - *out += base; - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); - *out += base; - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); - *out += base; - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); - *out += base; - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); - *out += base; - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); - *out += base; - out++; - *out = ( (*in) >> 3 ) ; - 
++in; - *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); - *out += base; - out++; - *out = ( (*in) >> 2 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); - *out += base; - out++; - *out = ( (*in) >> 1 ) ; - ++in; - *out += base; - out++; - - return in; -} - - - - -static const uint32_t * unpack32_32( uint32_t , const uint32_t * in, uint32_t * out) { - memcpy(out,in,32*sizeof(uint32_t)); - return in + 32; -} - -typedef const uint32_t * (*unpackfnc)( uint32_t, const uint32_t * , uint32_t * ); - -typedef uint32_t * (*packfnc)( uint32_t, const uint32_t * , uint32_t * ); - -static uint32_t * nullpacker( uint32_t, const uint32_t * , uint32_t * out) { - return out; -} - -static const uint32_t * nullunpacker( uint32_t base, const uint32_t * in , uint32_t * out) { - for(int k = 0; k < 32; ++k) { - out[k] = base; - } - return in; -} - - -static unpackfnc unpack32[33]= {nullunpacker,unpack1_32, - unpack2_32, - unpack3_32, - unpack4_32, - unpack5_32, - unpack6_32, - unpack7_32, - unpack8_32, - unpack9_32, - unpack10_32, - unpack11_32, - unpack12_32, - unpack13_32, - unpack14_32, - unpack15_32, - unpack16_32, - unpack17_32, - unpack18_32, - unpack19_32, - unpack20_32, - unpack21_32, - unpack22_32, - unpack23_32, - unpack24_32, - unpack25_32, - unpack26_32, - unpack27_32, - unpack28_32, - unpack29_32, - unpack30_32, - unpack31_32, - unpack32_32 - }; - -static packfnc pack32[33]= {nullpacker,pack1_32, - pack2_32, - pack3_32, - pack4_32, - pack5_32, - pack6_32, - pack7_32, - pack8_32, - pack9_32, - pack10_32, - pack11_32, - pack12_32, - pack13_32, - pack14_32, - pack15_32, - pack16_32, - pack17_32, - pack18_32, - pack19_32, - pack20_32, - pack21_32, - pack22_32, - pack23_32, - pack24_32, - pack25_32, - pack26_32, - pack27_32, - pack28_32, - pack29_32, - pack30_32, - pack31_32, - pack32_32 - }; - -static uint32_t bits(const uint32_t v) { - return v == 0 ? 
0 : 32 - __builtin_clz(v); -} - -uint32_t * SIMDCompressionLib::FrameOfReference::compress_length(const uint32_t * in, uint32_t length, uint32_t * out) { - if(length == 0) return out; - uint32_t m = in[0]; - uint32_t M = in[0]; - for(uint32_t i = 1; i < length; ++i) { - if(in[i]>M) M=in[i]; - if(in[i](M-m)); - - out[0] = m; - ++out; - out[0] = M; - ++out; - uint32_t k = 0; - for(; k+32<=length; k+=32,in+=32) { - out = pack32[b](m,in,out); - } - // we could pack the rest, but we don't bother - for(; k(M-m)); - for(uint32_t k = 0; k(M-m)); - if ((value < m) || (value > M)) {// should be unlikely - uint32_t newm = m; - uint32_t newM = M; - - if (value < m) { - newm = value; - } - if (value > M) { - newM = value; - } - int newb = bits(static_cast(newM - newm)); - if ((newb != b) || (value < m) ) { // unlucky path - vector < uint32_t > buffer(nvalue + 1); - uncompress_length(in, buffer.data(), nvalue); - buffer[nvalue] = value; - uint32_t * newin = compress_length(buffer.data(), nvalue + 1, in); - return (newin - in + 1) * sizeof(uint32_t); - } else {// only max changed - in[1] = newM; - M = newM;// probably unnecessary - } - } - // add one - in += 2; - uint32_t headersize = 3; - uint32_t index = nvalue; - if(b == 32) { - in[index] = value; - return ( index + 1 + headersize ) * sizeof(uint32_t); - } - uint32_t oldpackedlength = nvalue / 32 * 32; - uint32_t newpackedlength = ( nvalue + 1 ) / 32 * 32; - uint32_t packedsizeinwords = oldpackedlength * b / 32; - in[packedsizeinwords + index - oldpackedlength] = value; - if(oldpackedlength == newpackedlength) { - return (headersize + (packedsizeinwords + index - oldpackedlength)) * sizeof(uint32_t); - } - // we now have exactly 32 to pack - uint32_t buffer[32]; - memcpy(buffer,in + packedsizeinwords, 32 * sizeof (uint32_t)); // copy to a safe buffer - uint32_t * out = pack32[b](m,buffer,in + packedsizeinwords); - return ( out - in + headersize ) * sizeof(uint32_t); -} - - - - - -struct selectmetadata { - uint32_t m; - const 
uint32_t *in; - uint32_t b; - uint32_t length; -}; - -static uint32_t fastselect(selectmetadata * s, size_t index) { - if(s->b == 32) { - return s->in[index]; - } - uint32_t packedlength = s->length / 32 * 32; - if(index > packedlength) { - uint32_t packedsizeinwords = packedlength * s->b / 32; - return s->in[packedsizeinwords + index - packedlength]; - } - const int bitoffset = static_cast(index) * s->b; /* how many bits */ - const int firstword = bitoffset / 32; - const int secondword = (bitoffset + s->b - 1) / 32; - const uint32_t firstpart = s->in[firstword] - >> (bitoffset % 32); - const uint32_t mask = (1 << s->b) - 1; - if (firstword == secondword) { - /* easy common case*/ - return s->m + (firstpart & mask); - } else { - /* harder case where we need to combine two words */ - const uint32_t secondpart = s->in[firstword + 1]; - const int usablebitsinfirstword = 32 - (bitoffset % 32); - return s->m - + ((firstpart | (secondpart << usablebitsinfirstword)) - & mask); - } -} - -// Performs a lower bound find in the encoded array. 
-// Returns the index -size_t findlowerbound(const uint32_t *in, uint32_t key, - uint32_t *presult) { - selectmetadata s; - s.length = *in; - in ++; - s.m = *in; - ++in; - uint32_t M = *in; - ++in; - s.b = bits(M-s.m); - s.in = in; - - int count = s.length; - int begin = 0; - uint32_t val; - while (count > 0) { - int step = count / 2; - val = fastselect(&s, begin+step); - if (val < key) { - begin += step + 1; - count -= step + 1; - } else count = step; - } - *presult = fastselect(&s, begin); - return begin; -} - - - - - -/*** - * BEGIN OF linear search - */ - - - -static int search1_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 1 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 2 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 3 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 4 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 5 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 6 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 7 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 9 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 10 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 11 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 12 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 13 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 14 ) & 1 ; - *decoded += base; - 
if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 15 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 17 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 18 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 19 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 20 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 21 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 22 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 23 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 25 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 26 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 27 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 28 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 29 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 30 ) & 1 ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search2_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 2 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 4 ) % (1U << 2 ) ; - *decoded += base; - 
if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 6 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 8 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 10 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 12 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 14 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 18 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 20 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 22 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 24 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 26 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 28 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 2 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 4 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 6 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 8 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 10 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 12 ) % (1U << 2 ) ; - *decoded += base; - 
if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 14 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 18 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 20 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 22 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 24 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 26 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 28 ) % (1U << 2 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search3_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 3 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 6 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 9 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 12 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 15 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 18 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 21 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 27 ) % (1U << 
3 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 1 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 4 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 7 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 10 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 13 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 19 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 22 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 25 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 28 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 2 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 5 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 11 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 14 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 17 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 20 ) % (1U << 3 ) ; 
- *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 23 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 26 ) % (1U << 3 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search4_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 4 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 8 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 12 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 16 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 20 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 24 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 4 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 8 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 12 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 16 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 20 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 24 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 
28 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 4 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 8 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 12 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 16 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 20 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 24 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 4 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 8 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 12 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 16 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 20 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 24 ) % (1U << 4 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search5_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 5 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - 
*decoded = ( (*in) >> 10 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 15 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 20 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 25 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 3 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 13 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 18 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 23 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 1 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 6 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 11 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 21 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 26 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 4 ) % (1U << 5 ) ; - *decoded += base; - 
if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 9 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 14 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 19 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 2 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 7 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 12 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 17 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 22 ) % (1U << 5 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search6_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 6 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 12 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 18 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 24 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 4 ) % (1U << 6 ) ; - *decoded 
+= base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 10 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 22 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 2 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 8 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 14 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 20 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 6 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 12 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 18 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 24 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 4 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 10 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 22 ) % (1U << 6 ) ; - *decoded += base; - 
if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 2 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 8 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 14 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 20 ) % (1U << 6 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search7_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 7 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 14 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 21 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 3 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 10 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 17 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 6 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( 
(*in) >> 13 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 20 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 2 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 9 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 23 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 5 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 12 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 19 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 1 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 15 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 22 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 4 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 11 ) % (1U << 
7 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 18 ) % (1U << 7 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search8_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U 
<< 8 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 0 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 8 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 16 ) % (1U << 8 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search9_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 9 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 18 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) 
>> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 4 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 13 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 22 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 17 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 3 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 12 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 21 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 7 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 2 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 11 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 20 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - 
*decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 6 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 15 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 9 - 1 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 1 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 10 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 19 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 9 - 5 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 5 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 14 ) % (1U << 9 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search10_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 10 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 20 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 8 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 18 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - 
*decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 6 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 4 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 14 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 2 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 12 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 10 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 20 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 8 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 18 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 6 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded 
>= target) return 23 ; - *decoded = ( (*in) >> 16 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 4 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 14 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 2 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 12 ) % (1U << 10 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search11_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 11 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 1 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 12 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 2 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 13 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 
))<<( 11 - 3 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 3 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 14 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 4 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 15 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 5 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 6 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 17 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 7 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 18 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 19 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 
25 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 11 - 9 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 9 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 20 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 11 - 10 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 10 ) % (1U << 11 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search12_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 12 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 4 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 16 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 8 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 12 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - 
*decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 4 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 16 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 8 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 12 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 4 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 16 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 8 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 12 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 4 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 16 ) % (1U << 12 ) ; - *decoded += base; 
- if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 8 ) % (1U << 12 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search13_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 13 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 7 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 1 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 14 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 2 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 15 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); - *decoded += base; - if(*decoded 
>= target) return 12 ; - *decoded = ( (*in) >> 9 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 3 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 10 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 4 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 17 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 11 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 13 - 5 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 5 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 18 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 13 - 12 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 12 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 13 - 6 ); - *decoded 
+= base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 6 ) % (1U << 13 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search14_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 14 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 10 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 6 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 2 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 12 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 8 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - *decoded += base; - 
if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 4 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 14 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 10 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 6 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 2 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 12 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 8 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 4 ) % (1U << 14 ) ; - *decoded += base; - if(*decoded >= target) 
return 30 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search15_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 15 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 13 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 11 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 9 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 7 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 5 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 3 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); - *decoded += 
base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 1 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 14 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 12 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 10 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 6 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 15 - 4 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 4 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 15 - 2 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 2 ) % (1U << 15 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 17 ) ; - 
++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search16_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - 
*decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 0 ) % (1U << 16 ) ; - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search17_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 2 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 4 ) % (1U << 17 ) ; - *decoded 
+= base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 6 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 10 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 12 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 14 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 1 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 3 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); - *decoded 
+= base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 5 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 7 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 9 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 11 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 13 ) % (1U << 17 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search18_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 4 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( 
(*in) >> 8 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 12 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 2 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 6 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 10 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 4 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 8 ) % (1U << 18 ) ; - *decoded += base; 
- if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 12 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 2 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 6 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 10 ) % (1U << 18 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search19_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 6 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - *decoded += base; - if(*decoded >= 
target) return 3 ; - *decoded = ( (*in) >> 12 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 5 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 11 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 4 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 10 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 3 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 9 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= 
target) return 19 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 2 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 1 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 7 ) % (1U << 19 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 19 - 13 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search20_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 8 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 2 ; - 
*decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 4 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 8 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 4 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 8 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= 
((*in) % (1U<< 16 ))<<( 20 - 16 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 4 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 8 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 4 ) % (1U << 20 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search21_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 10 ) % (1U << 21 ) ; - *decoded += base; - 
if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 9 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 7 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 6 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 5 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 26 
) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 4 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 3 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 2 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 1 ) % (1U << 21 ) ; - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search22_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - 
*decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 2 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 4 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 6 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 8 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 22 ) ; - *decoded += base; - 
if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 2 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 4 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 6 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 8 ) % (1U << 22 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - 
- - -static int search23_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 5 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 1 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 6 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 2 ) % (1U << 23 ) ; - *decoded += 
base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 7 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 3 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 4 ) % (1U << 23 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); - *decoded += base; - 
if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 9 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search24_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= 
((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 0 ) % (1U << 24 ) ; - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - *decoded += base; - 
if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search25_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 4 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 1 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); - *decoded += base; - if(*decoded >= target) return 11 ; - 
*decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 5 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 9 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 2 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 6 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); - *decoded += base; - 
if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 3 ) % (1U << 25 ) ; - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 7 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search26_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 2 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - *decoded += base; - if(*decoded >= target) return 8 ; - 
*decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 4 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 2 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( 
(*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 4 ) % (1U << 26 ) ; - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search27_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 27 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 7 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - *decoded += base; - if(*decoded >= target) return 5 ; - 
*decoded = ( (*in) >> 2 ) % (1U << 27 ) ; - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 9 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 4 ) % (1U << 27 ) ; - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 1 ) % (1U << 27 ) ; - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); - *decoded += base; - 
if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 3 ) % (1U << 27 ) ; - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 5 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search28_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 28 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *decoded 
+= base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 0 ) % (1U << 28 ) ; - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 28 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *decoded += base; - if(*decoded >= 
target) return 17 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 0 ) % (1U << 28 ) ; - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search29_32(uint32_t base, const uint32_t * in, uint32_t target, 
uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 29 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 5 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 2 ) % (1U << 29 ) ; - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded 
|= ((*in) % (1U<< 19 ))<<( 29 - 19 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 7 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 1 ) % (1U << 29 ) ; - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 
))<<( 29 - 9 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 9 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 3 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search30_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 30 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 
12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 2 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 0 ) % (1U << 30 ) ; - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - *decoded 
+= base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 2 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search31_32(uint32_t base, const uint32_t * in, uint32_t target, uint32_t * decoded) { - - *decoded = ( (*in) >> 0 ) % (1U << 31 ) ; - *decoded += base; - if(*decoded >= target) return 0 ; - *decoded = ( (*in) >> 31 ) ; - ++in; - *decoded |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - *decoded += base; - if(*decoded >= target) return 1 ; - *decoded = ( (*in) >> 30 ) ; - ++in; - *decoded |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - *decoded += base; - if(*decoded >= target) return 2 ; - *decoded = ( (*in) >> 29 ) ; - ++in; - *decoded |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - *decoded += base; - if(*decoded >= target) return 3 ; - *decoded = ( (*in) >> 28 ) ; - ++in; - *decoded |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - *decoded += base; - if(*decoded >= target) return 4 ; - *decoded = ( (*in) >> 27 ) ; - ++in; - *decoded |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - *decoded += base; - if(*decoded >= target) return 5 ; - *decoded = ( (*in) >> 26 ) ; - ++in; - *decoded 
|= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - *decoded += base; - if(*decoded >= target) return 6 ; - *decoded = ( (*in) >> 25 ) ; - ++in; - *decoded |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - *decoded += base; - if(*decoded >= target) return 7 ; - *decoded = ( (*in) >> 24 ) ; - ++in; - *decoded |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); - *decoded += base; - if(*decoded >= target) return 8 ; - *decoded = ( (*in) >> 23 ) ; - ++in; - *decoded |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); - *decoded += base; - if(*decoded >= target) return 9 ; - *decoded = ( (*in) >> 22 ) ; - ++in; - *decoded |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); - *decoded += base; - if(*decoded >= target) return 10 ; - *decoded = ( (*in) >> 21 ) ; - ++in; - *decoded |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); - *decoded += base; - if(*decoded >= target) return 11 ; - *decoded = ( (*in) >> 20 ) ; - ++in; - *decoded |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); - *decoded += base; - if(*decoded >= target) return 12 ; - *decoded = ( (*in) >> 19 ) ; - ++in; - *decoded |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); - *decoded += base; - if(*decoded >= target) return 13 ; - *decoded = ( (*in) >> 18 ) ; - ++in; - *decoded |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); - *decoded += base; - if(*decoded >= target) return 14 ; - *decoded = ( (*in) >> 17 ) ; - ++in; - *decoded |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); - *decoded += base; - if(*decoded >= target) return 15 ; - *decoded = ( (*in) >> 16 ) ; - ++in; - *decoded |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); - *decoded += base; - if(*decoded >= target) return 16 ; - *decoded = ( (*in) >> 15 ) ; - ++in; - *decoded |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); - *decoded += base; - if(*decoded >= target) return 17 ; - *decoded = ( (*in) >> 14 ) ; - ++in; - *decoded |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); - *decoded += base; - if(*decoded >= target) return 18 ; - *decoded = ( (*in) >> 13 ) ; - ++in; - *decoded |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); - *decoded += base; - if(*decoded >= target) return 19 ; - *decoded = ( (*in) >> 
12 ) ; - ++in; - *decoded |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); - *decoded += base; - if(*decoded >= target) return 20 ; - *decoded = ( (*in) >> 11 ) ; - ++in; - *decoded |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); - *decoded += base; - if(*decoded >= target) return 21 ; - *decoded = ( (*in) >> 10 ) ; - ++in; - *decoded |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); - *decoded += base; - if(*decoded >= target) return 22 ; - *decoded = ( (*in) >> 9 ) ; - ++in; - *decoded |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); - *decoded += base; - if(*decoded >= target) return 23 ; - *decoded = ( (*in) >> 8 ) ; - ++in; - *decoded |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); - *decoded += base; - if(*decoded >= target) return 24 ; - *decoded = ( (*in) >> 7 ) ; - ++in; - *decoded |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); - *decoded += base; - if(*decoded >= target) return 25 ; - *decoded = ( (*in) >> 6 ) ; - ++in; - *decoded |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); - *decoded += base; - if(*decoded >= target) return 26 ; - *decoded = ( (*in) >> 5 ) ; - ++in; - *decoded |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); - *decoded += base; - if(*decoded >= target) return 27 ; - *decoded = ( (*in) >> 4 ) ; - ++in; - *decoded |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); - *decoded += base; - if(*decoded >= target) return 28 ; - *decoded = ( (*in) >> 3 ) ; - ++in; - *decoded |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); - *decoded += base; - if(*decoded >= target) return 29 ; - *decoded = ( (*in) >> 2 ) ; - ++in; - *decoded |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); - *decoded += base; - if(*decoded >= target) return 30 ; - *decoded = ( (*in) >> 1 ) ; - ++in; - *decoded += base; - if(*decoded >= target) return 31 ; - return -1; - - } - - - - -static int search32_32(uint32_t , const uint32_t * in, uint32_t target, uint32_t * decoded) { - for(int k = 0 ; k < 32; ++k) { - if(in[k] >= target) { - *decoded = in[k]; - return k; - } - } - return -1; - -} - - -static int searchlinearfor(uint32_t base, const uint32_t * in, uint32_t bit, - uint32_t target, uint32_t *presult) -{ 
- switch (bit) { - case 0: if(target < base) return -1; else {*presult = base; return 0;} - -case 1: return search1_32(base, in, target, presult); - -case 2: return search2_32(base, in, target, presult); - -case 3: return search3_32(base, in, target, presult); - -case 4: return search4_32(base, in, target, presult); - -case 5: return search5_32(base, in, target, presult); - -case 6: return search6_32(base, in, target, presult); - -case 7: return search7_32(base, in, target, presult); - -case 8: return search8_32(base, in, target, presult); - -case 9: return search9_32(base, in, target, presult); - -case 10: return search10_32(base, in, target, presult); - -case 11: return search11_32(base, in, target, presult); - -case 12: return search12_32(base, in, target, presult); - -case 13: return search13_32(base, in, target, presult); - -case 14: return search14_32(base, in, target, presult); - -case 15: return search15_32(base, in, target, presult); - -case 16: return search16_32(base, in, target, presult); - -case 17: return search17_32(base, in, target, presult); - -case 18: return search18_32(base, in, target, presult); - -case 19: return search19_32(base, in, target, presult); - -case 20: return search20_32(base, in, target, presult); - -case 21: return search21_32(base, in, target, presult); - -case 22: return search22_32(base, in, target, presult); - -case 23: return search23_32(base, in, target, presult); - -case 24: return search24_32(base, in, target, presult); - -case 25: return search25_32(base, in, target, presult); - -case 26: return search26_32(base, in, target, presult); - -case 27: return search27_32(base, in, target, presult); - -case 28: return search28_32(base, in, target, presult); - -case 29: return search29_32(base, in, target, presult); - -case 30: return search30_32(base, in, target, presult); - -case 31: return search31_32(base, in, target, presult); - -case 32: return search32_32(base, in, target, presult); - - default: break; - } - return (-1); 
-} - - -/*** - * END OF linear search - */ - - - -size_t findlower_linear(const uint32_t *in, uint32_t key, -uint32_t *presult) { - selectmetadata s; - s.length = *in; - in ++; - s.m = *in; - ++in; - uint32_t M = *in; - ++in; - s.b = bits(M-s.m); - s.in = in; - - - for(uint32_t k = 0, offset = 0; k < s.length; k += 32, offset += s.b) { - if(32 + k > s.length) { - for(uint32_t z = 0; z< s.length-k;++z) { - if(*(s.in + offset + z) >= key) { - *presult = *(s.in + offset + z); - return k + z; - } - } - return -1; - } - - int x = searchlinearfor(s.m, s.in + offset, s.b,key,presult); - if (x >= 0) { - return k + x; - } - } - return -1; -} - - - -// Performs a lower bound find in the encoded array. -// Returns the index -size_t SIMDCompressionLib::FrameOfReference::findLowerBound(const uint32_t *in, const size_t , uint32_t key, - uint32_t *presult) { - const bool USES_BINARY = true; - if(USES_BINARY) { - return findlowerbound(in, key,presult); - } else { - return findlower_linear(in, key,presult); - - } -} - - -// Returns a decompressed value in an encoded array -uint32_t SIMDCompressionLib::FrameOfReference::select(const uint32_t *in, size_t index) { - uint32_t length = *in; - in ++; - uint32_t m = *in; - ++in; - uint32_t M = *in; - ++in; - uint32_t b = bits(M-m); - if(b == 32) { - return in[index]; - } - uint32_t packedlength = length / 32 * 32; - if(index > packedlength) { - uint32_t packedsizeinwords = packedlength * b / 32; - return in[packedsizeinwords + index - packedlength]; - } - const int bitoffset = static_cast(index) * b; /* how many bits */ - const int firstword = bitoffset / 32; - const int secondword = (bitoffset + b - 1) / 32; - const uint32_t firstpart = in[firstword] - >> (bitoffset % 32); - const uint32_t mask = (1 << b) - 1; - if (firstword == secondword) { - /* easy common case*/ - return m + (firstpart & mask); - } else { - /* harder case where we need to combine two words */ - const uint32_t secondpart = in[firstword + 1]; - const int 
usablebitsinfirstword = 32 - (bitoffset % 32); - return m - + ((firstpart | (secondpart << usablebitsinfirstword)) - & mask); - } -} - - - -/******************************** - * SIMD VERSION FOLLOWS - ******************************** - ******************************** - ********************************/ - - - - - -static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { - __m128i *out = (__m128i*)(_out); - int i; - (void) _in; - for (i = 0; i < 8; ++i) { - _mm_storeu_si128(out++, initOffset); - _mm_storeu_si128(out++, initOffset); - _mm_storeu_si128(out++, initOffset); - _mm_storeu_si128(out++, initOffset); - } - - return initOffset; -} - - - - -static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { - (void) initOffset; - (void) _in; - (void) out; -} - - -static void ipackFOR1(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR2(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR3(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR4(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR5(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR6(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, 
initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, 
initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR7(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR8(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR9(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR10(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR11(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; 
- CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - 
_mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR12(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg 
= _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR13(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR14(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR15(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn 
= _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR16(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); 
- InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, 
initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR17(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg 
= _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR18(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR19(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR20(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR21(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, 
initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 
7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR22(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); 
- - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR23(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR24(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR25(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const 
__m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR26(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR27(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - 
__m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR28(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR29(__m128i initOffset, const uint32_t * _in, __m128i * out) 
{ - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR30(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); 
- _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR31(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_sub_epi32(CurrIn, initOffset); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - 
_mm_storeu_si128(out, OutReg); - - -} - - - -static void ipackFOR32(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - __m128i InReg = _mm_loadu_si128(in); - (void) initOffset; - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = 
_mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - - -static __m128i iunpackFOR1(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<1)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg 
= _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR2(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<2)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR3(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - 
__m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<3)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR4(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<4)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR5(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<5)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); 
- - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR6(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<6)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR7(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<7)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, 
mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, 
mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR8(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<8)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR9(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<9)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR10(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<10)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR11(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<11)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR12(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<12)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; 
- ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR13(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<13)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR14(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<14)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), 
mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, 
initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - 
- tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR15(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<15)-1); - - - - tmp = InReg; - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - 
OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR16(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<16)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR17(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<17)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = 
tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR18(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<18)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); 
- - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -static __m128i iunpackFOR19(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<19)-1); - + } + if (bit == 32) { + memcpy(out, in, length * sizeof(uint32_t)); + return (const __m128i *)((const uint32_t *)in + length); + } + offset = _mm_set1_epi32(initvalue); + maskbits = _mm_set1_epi32((1 << bit) - 1); + inwordpointer = 0; + P = _mm_loadu_si128((__m128i *)in); + ++in; + for (k = 0; k < length / 4; ++k) { + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = 
_mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)out, _mm_add_epi32(answer, offset)); + out += 4; + } + if (length % 4 != 0) { + uint32_t buffer[4]; + __m128i answer = _mm_srli_epi32(P, inwordpointer); + const uint32_t firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if (bit < firstpass) { + inwordpointer += bit; + } else { + P = _mm_loadu_si128((__m128i *)in); + ++in; + answer = _mm_or_si128(_mm_slli_epi32(P, firstpass), answer); + inwordpointer = bit - firstpass; + } + answer = _mm_and_si128(maskbits, answer); + _mm_storeu_si128((__m128i *)buffer, _mm_add_epi32(answer, offset)); + for (k = 0; k < (length % 4); ++k) { + *out = buffer[k]; + ++out; + } + } + return in; +} +static uint32_t *pack1_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= 
static_cast((*in) - base) << 21; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 23; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 25; + ++in; + *out |= static_cast((*in) - base) << 26; + ++in; + *out |= static_cast((*in) - base) << 27; + ++in; + *out |= static_cast((*in) - base) << 28; + ++in; + *out |= static_cast((*in) - base) << 29; + ++in; + *out |= static_cast((*in) - base) << 30; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + ++in; + + return out; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack2_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 26; + ++in; + *out |= static_cast((*in) - base) << 28; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 14; + 
++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 26; + ++in; + *out |= static_cast((*in) - base) << 28; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); +static uint32_t *pack3_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 21; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 27; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (3 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 25; + ++in; + *out |= static_cast((*in) - base) << 28; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (3 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - 
base) << 5; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 23; + ++in; + *out |= static_cast((*in) - base) << 26; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + ++in; + + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack4_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= 
static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack5_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 25; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (5 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 23; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (5 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 21; + ++in; + *out |= static_cast((*in) - base) << 26; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (5 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = 
static_cast((*in) - base) >> (5 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); +static uint32_t *pack6_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (6 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (6 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (6 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) 
<< 10; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (6 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + ++in; + + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack7_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 21; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (7 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 24; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (7 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (7 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 23; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (7 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; 
+ *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (7 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (7 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack8_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; 
+ *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); +static uint32_t *pack9_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (9 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 22; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (9 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (9 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 21; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (9 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (9 - 2); + ++in; + *out |= 
static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (9 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (9 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (9 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + ++in; + + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static uint32_t *pack10_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (10 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (10 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (10 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (10 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (10 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out 
|= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (10 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (10 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (10 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + ++in; + + return out; +} +static uint32_t *pack11_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (11 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (11 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (11 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (11 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (11 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) 
- base) << 27; + ++out; + *out = static_cast((*in) - base) >> (11 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (11 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (11 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 19; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (11 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 20; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (11 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + ++in; + + return out; } +static uint32_t *pack12_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (12 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (12 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (12 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 28; + 
++out; + *out = static_cast((*in) - base) >> (12 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (12 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (12 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (12 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (12 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + ++in; + + return out; +} +static uint32_t *pack13_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (13 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (13 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (13 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = 
static_cast((*in) - base) >> (13 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (13 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (13 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (13 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (13 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 17; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (13 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (13 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 18; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (13 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (13 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + ++in; + + return out; +} +static uint32_t *pack14_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (14 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= 
static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (14 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (14 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (14 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (14 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (14 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (14 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (14 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (14 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (14 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (14 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (14 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out 
|= static_cast((*in) - base) << 18; + ++out; + ++in; + + return out; +} -static __m128i iunpackFOR20(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static uint32_t *pack15_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 15; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (15 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (15 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (15 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (15 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (15 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (15 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (15 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 16; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (15 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (15 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (15 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= 
static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (15 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (15 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (15 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (15 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + ++in; + + return out; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<20)-1); +static uint32_t *pack16_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = 
static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + ++in; + + return out; +} +static uint32_t *pack17_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (17 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (17 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (17 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (17 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (17 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (17 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (17 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (17 - 16); + ++in; + *out |= 
static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (17 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (17 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (17 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (17 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (17 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (17 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (17 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (17 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + ++in; + + return out; +} +static uint32_t *pack18_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (18 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (18 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (18 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (18 - 16); + ++in; 
+ *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (18 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (18 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (18 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (18 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (18 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (18 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (18 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (18 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (18 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (18 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (18 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (18 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + ++in; + + return out; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg 
= _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack19_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (19 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (19 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (19 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (19 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (19 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (19 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (19 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (19 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (19 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (19 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (19 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (19 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (19 - 
2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (19 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (19 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (19 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (19 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (19 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); +static uint32_t *pack20_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (20 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (20 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (20 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (20 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (20 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = 
static_cast((*in) - base) >> (20 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (20 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (20 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (20 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (20 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (20 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (20 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (20 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (20 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (20 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (20 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + ++in; + + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static uint32_t *pack21_32(uint32_t base, const uint32_t *in, 
uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (21 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (21 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (21 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (21 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (21 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (21 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (21 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (21 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (21 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (21 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (21 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (21 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (21 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (21 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = 
static_cast((*in) - base) >> (21 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (21 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (21 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (21 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (21 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (21 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + ++in; + + return out; +} +static uint32_t *pack22_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (22 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (22 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (22 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (22 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (22 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (22 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (22 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (22 - 8); + ++in; + *out |= static_cast((*in) - base) 
<< 8; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (22 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (22 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (22 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (22 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (22 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (22 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (22 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (22 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (22 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (22 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (22 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (22 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + ++in; + + return out; } +static uint32_t *pack23_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (23 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = 
static_cast((*in) - base) >> (23 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (23 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (23 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (23 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (23 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (23 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (23 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (23 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + *out = static_cast((*in) - base) >> (23 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (23 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (23 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (23 - 21); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (23 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (23 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (23 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (23 - 8); + ++in; + *out |= 
static_cast((*in) - base) << 8; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (23 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (23 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (23 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (23 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (23 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++out; + ++in; + + return out; +} +static uint32_t *pack24_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 
- 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (24 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (24 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + ++in; + + return out; +} +static uint32_t *pack25_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (25 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (25 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + *out = static_cast((*in) - base) >> (25 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (25 - 22); + ++in; + *out |= 
static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (25 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (25 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (25 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (25 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (25 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (25 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (25 - 23); + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (25 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (25 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++out; + *out = static_cast((*in) - base) >> (25 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (25 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (25 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (25 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (25 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (25 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (25 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = 
static_cast((*in) - base) >> (25 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (25 - 21); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (25 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (25 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++out; + ++in; + + return out; +} -static __m128i iunpackFOR21(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static uint32_t *pack26_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (26 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (26 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (26 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (26 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (26 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (26 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (26 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (26 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (26 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (26 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (26 - 
12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (26 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (26 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (26 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (26 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (26 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (26 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (26 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (26 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (26 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (26 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (26 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (26 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (26 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + ++in; + + return out; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<21)-1); +static uint32_t *pack27_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= 
static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (27 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (27 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (27 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (27 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++out; + *out = static_cast((*in) - base) >> (27 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (27 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (27 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (27 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (27 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++out; + *out = static_cast((*in) - base) >> (27 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (27 - 26); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (27 - 21); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (27 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (27 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + *out = static_cast((*in) - base) >> (27 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + *out = static_cast((*in) - base) >> (27 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (27 - 23); + ++in; + *out |= 
static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (27 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (27 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (27 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (27 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (27 - 25); + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (27 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (27 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (27 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (27 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++out; + ++in; + + return out; +} +static uint32_t *pack28_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (28 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (28 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (28 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (28 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (28 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (28 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= 
static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (28 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (28 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (28 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (28 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (28 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (28 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (28 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (28 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (28 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (28 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (28 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (28 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (28 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (28 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (28 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (28 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = 
static_cast((*in) - base) >> (28 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (28 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + ++in; + + return out; +} +static uint32_t *pack29_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (29 - 26); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (29 - 23); + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (29 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (29 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (29 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (29 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + *out = static_cast((*in) - base) >> (29 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (29 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++out; + *out = static_cast((*in) - base) >> (29 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (29 - 28); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (29 - 25); + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (29 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (29 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (29 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = 
static_cast((*in) - base) >> (29 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (29 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (29 - 7); + ++in; + *out |= static_cast((*in) - base) << 7; + ++out; + *out = static_cast((*in) - base) >> (29 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + *out = static_cast((*in) - base) >> (29 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (29 - 27); + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (29 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (29 - 21); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (29 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (29 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (29 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (29 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++out; + *out = static_cast((*in) - base) >> (29 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + *out = static_cast((*in) - base) >> (29 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++out; + ++in; + + return out; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack30_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (30 - 28); + ++in; + *out |= static_cast((*in) - base) << 28; + 
++out; + *out = static_cast((*in) - base) >> (30 - 26); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (30 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (30 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (30 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (30 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (30 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (30 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (30 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (30 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (30 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (30 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + *out = static_cast((*in) - base) >> (30 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + *out = static_cast((*in) - base) >> (30 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++out; + ++in; + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (30 - 28); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (30 - 26); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> (30 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (30 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (30 - 
20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (30 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (30 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (30 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (30 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (30 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (30 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (30 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + *out = static_cast((*in) - base) >> (30 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + *out = static_cast((*in) - base) >> (30 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++out; + ++in; + + return out; +} - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); +static uint32_t *pack31_32(uint32_t base, const uint32_t *in, uint32_t *out) { + + *out = static_cast((*in) - base); + ++in; + *out |= static_cast((*in) - base) << 31; + ++out; + *out = static_cast((*in) - base) >> (31 - 30); + ++in; + *out |= static_cast((*in) - base) << 30; + ++out; + *out = static_cast((*in) - base) >> (31 - 29); + ++in; + *out |= static_cast((*in) - base) << 29; + ++out; + *out = static_cast((*in) - base) >> (31 - 28); + ++in; + *out |= static_cast((*in) - base) << 28; + ++out; + *out = static_cast((*in) - base) >> (31 - 27); + ++in; + *out |= static_cast((*in) - base) << 27; + ++out; + *out = static_cast((*in) - base) >> (31 - 26); + ++in; + *out |= static_cast((*in) - base) << 26; + ++out; + *out = static_cast((*in) - base) >> 
(31 - 25); + ++in; + *out |= static_cast((*in) - base) << 25; + ++out; + *out = static_cast((*in) - base) >> (31 - 24); + ++in; + *out |= static_cast((*in) - base) << 24; + ++out; + *out = static_cast((*in) - base) >> (31 - 23); + ++in; + *out |= static_cast((*in) - base) << 23; + ++out; + *out = static_cast((*in) - base) >> (31 - 22); + ++in; + *out |= static_cast((*in) - base) << 22; + ++out; + *out = static_cast((*in) - base) >> (31 - 21); + ++in; + *out |= static_cast((*in) - base) << 21; + ++out; + *out = static_cast((*in) - base) >> (31 - 20); + ++in; + *out |= static_cast((*in) - base) << 20; + ++out; + *out = static_cast((*in) - base) >> (31 - 19); + ++in; + *out |= static_cast((*in) - base) << 19; + ++out; + *out = static_cast((*in) - base) >> (31 - 18); + ++in; + *out |= static_cast((*in) - base) << 18; + ++out; + *out = static_cast((*in) - base) >> (31 - 17); + ++in; + *out |= static_cast((*in) - base) << 17; + ++out; + *out = static_cast((*in) - base) >> (31 - 16); + ++in; + *out |= static_cast((*in) - base) << 16; + ++out; + *out = static_cast((*in) - base) >> (31 - 15); + ++in; + *out |= static_cast((*in) - base) << 15; + ++out; + *out = static_cast((*in) - base) >> (31 - 14); + ++in; + *out |= static_cast((*in) - base) << 14; + ++out; + *out = static_cast((*in) - base) >> (31 - 13); + ++in; + *out |= static_cast((*in) - base) << 13; + ++out; + *out = static_cast((*in) - base) >> (31 - 12); + ++in; + *out |= static_cast((*in) - base) << 12; + ++out; + *out = static_cast((*in) - base) >> (31 - 11); + ++in; + *out |= static_cast((*in) - base) << 11; + ++out; + *out = static_cast((*in) - base) >> (31 - 10); + ++in; + *out |= static_cast((*in) - base) << 10; + ++out; + *out = static_cast((*in) - base) >> (31 - 9); + ++in; + *out |= static_cast((*in) - base) << 9; + ++out; + *out = static_cast((*in) - base) >> (31 - 8); + ++in; + *out |= static_cast((*in) - base) << 8; + ++out; + *out = static_cast((*in) - base) >> (31 - 7); + ++in; + *out |= 
static_cast((*in) - base) << 7; + ++out; + *out = static_cast((*in) - base) >> (31 - 6); + ++in; + *out |= static_cast((*in) - base) << 6; + ++out; + *out = static_cast((*in) - base) >> (31 - 5); + ++in; + *out |= static_cast((*in) - base) << 5; + ++out; + *out = static_cast((*in) - base) >> (31 - 4); + ++in; + *out |= static_cast((*in) - base) << 4; + ++out; + *out = static_cast((*in) - base) >> (31 - 3); + ++in; + *out |= static_cast((*in) - base) << 3; + ++out; + *out = static_cast((*in) - base) >> (31 - 2); + ++in; + *out |= static_cast((*in) - base) << 2; + ++out; + *out = static_cast((*in) - base) >> (31 - 1); + ++in; + *out |= static_cast((*in) - base) << 1; + ++out; + ++in; + + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static uint32_t *pack32_32(uint32_t, const uint32_t *in, uint32_t *out) { + memcpy(out, in, 32 * sizeof(uint32_t)); + return out + 32; +} - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack1_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) & 1; + *out += base; + out++; + *out = ((*in) >> 1) & 1; + *out += base; + out++; + *out = ((*in) >> 2) & 1; + *out += base; + out++; + *out = ((*in) >> 3) & 1; + *out += base; + out++; + *out = ((*in) >> 4) & 1; + *out += base; + out++; + *out = ((*in) >> 5) & 1; + *out += base; + out++; + *out = ((*in) >> 6) & 1; + *out += base; + out++; + *out = ((*in) >> 7) & 1; + *out += base; + out++; + *out = ((*in) >> 8) & 1; + *out += base; + out++; + *out = ((*in) >> 9) & 1; + *out += base; + out++; + *out = ((*in) >> 10) & 1; + *out += base; + out++; + *out = ((*in) >> 11) & 1; + *out += base; + out++; + *out = ((*in) >> 12) & 1; + *out += base; + out++; + *out = ((*in) >> 13) & 1; + *out += base; + out++; + *out = ((*in) >> 14) & 1; + *out += base; + out++; + *out = ((*in) >> 15) & 1; + *out += 
base; + out++; + *out = ((*in) >> 16) & 1; + *out += base; + out++; + *out = ((*in) >> 17) & 1; + *out += base; + out++; + *out = ((*in) >> 18) & 1; + *out += base; + out++; + *out = ((*in) >> 19) & 1; + *out += base; + out++; + *out = ((*in) >> 20) & 1; + *out += base; + out++; + *out = ((*in) >> 21) & 1; + *out += base; + out++; + *out = ((*in) >> 22) & 1; + *out += base; + out++; + *out = ((*in) >> 23) & 1; + *out += base; + out++; + *out = ((*in) >> 24) & 1; + *out += base; + out++; + *out = ((*in) >> 25) & 1; + *out += base; + out++; + *out = ((*in) >> 26) & 1; + *out += base; + out++; + *out = ((*in) >> 27) & 1; + *out += base; + out++; + *out = ((*in) >> 28) & 1; + *out += base; + out++; + *out = ((*in) >> 29) & 1; + *out += base; + out++; + *out = ((*in) >> 30) & 1; + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); +static const uint32_t *unpack2_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 26) % (1U << 2); + *out += base; + out++; + 
*out = ((*in) >> 28) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 26) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 28) % (1U << 2); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out += base; + out++; + + return in; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack3_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 21) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 27) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + *out += base; + out++; + *out = ((*in) >> 1) 
% (1U << 3); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 19) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 25) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 28) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 2)) << (3 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 23) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 26) % (1U << 3); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); +static const uint32_t *unpack4_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 4); + *out += base; + out++; + *out 
= ((*in) >> 28); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 4); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out += base; + out++; + + return in; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack5_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 5); + *out += base; + out++; + *out = 
((*in) >> 25) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 23) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 21) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 26) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 4)) << (5 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 19) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 2)) << (5 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 5); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; 
- ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); +static const uint32_t *unpack6_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 10) 
% (1U << 6); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 6); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out += base; + out++; + + return in; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); 
- - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static const uint32_t *unpack7_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + 
*out = ((*in) >> 0) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 21) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 24) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 23) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 5)) << (7 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 19) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 1)) << (7 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 4)) << (7 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 7); + *out += base; + out++; + *out = 
((*in) >> 11) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 7); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack8_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 8); + *out += base; + 
out++; + *out = ((*in) >> 8) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 8); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out += base; + out++; + + return in; } +static const uint32_t *unpack9_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 22) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 21) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 2)) << (9 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 6)) << (9 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 
24); + ++in; + *out |= ((*in) % (1U << 1)) << (9 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 19) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 5)) << (9 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 9); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack10_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; 
+ *out |= ((*in) % (1U << 8)) << (10 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 10); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack11_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 
15) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 5)) << (11 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 6)) << (11 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 7)) << (11 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 8)) << (11 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 19) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 9)) << (11 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 20) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 10)) << (11 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 11); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out += base; + out++; + + return in; +} -static __m128i iunpackFOR22(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static const uint32_t *unpack12_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + 
*out += base; + out++; + *out = ((*in) >> 8) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 12); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out += base; + out++; + + return in; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = 
_mm_set1_epi32((1U<<22)-1); +static const uint32_t *unpack13_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) << (13 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 10)) << (13 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 4)) << (13 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 17) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 11)) << (13 - 11); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % 
(1U << 5)) << (13 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 18) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 12)) << (13 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 6)) << (13 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 13); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack14_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 14); + *out += base; + 
out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 14); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack15_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 15) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 
22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 16) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 14)) << (15 - 14); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 12)) << (15 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 10)) << (15 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 8)) << (15 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 6)) << (15 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 4)) << (15 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 2)) << (15 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 15); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out += base; + out++; + + return in; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack16_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 16); + 
*out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); +static const uint32_t *unpack17_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + *out += base; + out++; + *out = ((*in) >> 14) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 1)) << (17 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 3)) << (17 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 5)) << (17 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 7)) << (17 - 
7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 9)) << (17 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 11)) << (17 - 11); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 13)) << (17 - 13); + *out += base; + out++; + *out = ((*in) >> 13) % (1U << 17); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 15)) << (17 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out += base; + out++; + + return in; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack18_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + *out += base; + 
out++; + *out = ((*in) >> 10) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 18); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); +static const uint32_t *unpack19_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 
19); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + *out += base; + out++; + *out = ((*in) >> 12) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + *out += base; + out++; + *out = ((*in) >> 11) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 3)) << (19 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 9)) << (19 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 15)) << (19 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 2)) << (19 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 8)) << (19 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 14)) << (19 - 14); + *out += base; + out++; + 
*out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 1)) << (19 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 7)) << (19 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 19); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 13)) << (19 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out += base; + out++; + + return in; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack20_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 20); + 
*out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 20); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack21_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + *out += base; + out++; + *out = ((*in) >> 10) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + *out += base; + out++; + *out = ((*in) >> 9) % (1U << 21); + *out += base; + out++; 
+ *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 5)) << (21 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 15)) << (21 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 4)) << (21 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 14)) << (21 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 3)) << (21 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 13)) << (21 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 2)) << (21 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 12)) << (21 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U 
<< 1)) << (21 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 21); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 11)) << (21 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack22_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out += base; + 
out++; + *out = ((*in) >> 0) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 22); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out += base; + out++; + + return in; +} - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static const uint32_t *unpack23_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= 
((*in) % (1U << 1)) << (23 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 7)) << (23 - 7); + *out += base; + out++; + *out = ((*in) >> 7) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 21)) << (23 - 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 12)) << (23 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 3)) << (23 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 17)) << (23 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 8)) << (23 - 8); + *out += base; + out++; + *out = ((*in) >> 8) % (1U << 23); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 22)) << (23 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 13)) << (23 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 4)) << (23 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 23); + *out += base; + 
out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 18)) << (23 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 9)) << (23 - 9); + *out += base; + out++; + *out = ((*in) >> 9); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack24_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U 
<< 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out += base; + out++; + + return in; } +static const uint32_t *unpack25_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 
19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + *out += base; + out++; + *out = ((*in) >> 5) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 9)) << (25 - 9); + *out += base; + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 2)) << (25 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 20)) << (25 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 13)) << (25 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 6)) << (25 - 6); + *out += base; + out++; + *out = ((*in) >> 6) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 24)) << (25 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 17)) << (25 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 10)) << (25 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 3)) << (25 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 25); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 21)) << (25 - 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 14)) << (25 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 7)) << (25 - 7); + *out += base; + out++; + *out = ((*in) >> 7); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t 
*unpack26_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + *out += base; + out++; + *out = 
((*in) >> 2) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 26); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack27_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 27); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + *out += base; + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 27); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); 
+ *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + *out += base; + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + *out += base; + out++; + *out = ((*in) >> 4) % (1U << 27); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 11)) << (27 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 6)) << (27 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 1)) << (27 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 27); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 23)) << (27 - 23); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 18)) << (27 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 13)) << (27 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 8)) << (27 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 3)) << (27 - 3); + *out += base; + out++; + *out = ((*in) >> 3) % (1U << 27); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 25)) << (27 - 25); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 20)) << (27 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 15)) << (27 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 10)) << (27 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 5)) << (27 - 
5); + *out += base; + out++; + *out = ((*in) >> 5); + ++in; + *out += base; + out++; + + return in; +} -static __m128i iunpackFOR23(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static const uint32_t *unpack28_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + *out += base; + out++; + 
*out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out += base; + out++; + + return in; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<23)-1); +static const uint32_t *unpack29_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 29); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + 
*out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + *out += base; + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + *out += base; + out++; + *out = ((*in) >> 2) % (1U << 29); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 13)) << (29 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 10)) << (29 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 7)) << (29 - 7); + *out += base; + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 4)) << (29 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 1)) << (29 - 1); + *out += base; + out++; + *out = ((*in) >> 1) % (1U << 29); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 27)) << (29 - 27); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 24)) << (29 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 21)) << (29 - 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 18)) << (29 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + 
*out |= ((*in) % (1U << 15)) << (29 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 12)) << (29 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 9)) << (29 - 9); + *out += base; + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 6)) << (29 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 3)) << (29 - 3); + *out += base; + out++; + *out = ((*in) >> 3); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack30_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 30); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + *out += 
base; + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + *out += base; + out++; + *out = ((*in) >> 2); + ++in; + *out += base; + out++; + *out = ((*in) >> 0) % (1U << 30); + *out += base; + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + *out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + *out += base; + out++; + *out = ((*in) >> 2); + ++in; + *out += base; + out++; + + return in; +} +static const uint32_t *unpack31_32(uint32_t base, const uint32_t *in, + uint32_t *out) { + + *out = ((*in) >> 0) % (1U << 31); + *out += base; + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + *out += base; + out++; + *out = ((*in) >> 
30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + *out += base; + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + *out += base; + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + *out += base; + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + *out += base; + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + *out += base; + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + *out += base; + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 23)) << (31 - 23); + *out += base; + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + *out += base; + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + *out += base; + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + *out += base; + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + *out += base; + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + *out += base; + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + *out += base; + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + *out += base; + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 15)) << (31 - 15); + *out += base; + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 14)) << (31 - 14); + *out += base; + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 13)) << (31 - 13); + *out += base; + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 12)) << (31 - 12); + *out += base; + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 11)) << (31 - 11); + *out += base; + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 10)) << (31 - 10); + 
*out += base; + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 9)) << (31 - 9); + *out += base; + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 8)) << (31 - 8); + *out += base; + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 7)) << (31 - 7); + *out += base; + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 6)) << (31 - 6); + *out += base; + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 5)) << (31 - 5); + *out += base; + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 4)) << (31 - 4); + *out += base; + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 3)) << (31 - 3); + *out += base; + out++; + *out = ((*in) >> 3); + ++in; + *out |= ((*in) % (1U << 2)) << (31 - 2); + *out += base; + out++; + *out = ((*in) >> 2); + ++in; + *out |= ((*in) % (1U << 1)) << (31 - 1); + *out += base; + out++; + *out = ((*in) >> 1); + ++in; + *out += base; + out++; + + return in; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static const uint32_t *unpack32_32(uint32_t, const uint32_t *in, + uint32_t *out) { + memcpy(out, in, 32 * sizeof(uint32_t)); + return in + 32; +} - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); +typedef const uint32_t *(*unpackfnc)(uint32_t, const uint32_t *, uint32_t *); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +typedef uint32_t *(*packfnc)(uint32_t, const uint32_t *, uint32_t *); - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); +static uint32_t *nullpacker(uint32_t, const uint32_t *, uint32_t *out) { + return out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); +static const uint32_t *nullunpacker(uint32_t base, const uint32_t *in, + uint32_t *out) { + for (int k = 0; k < 32; ++k) { + out[k] = base; + } + return in; +} - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static unpackfnc unpack32[33] = { + nullunpacker, unpack1_32, unpack2_32, unpack3_32, unpack4_32, + unpack5_32, unpack6_32, unpack7_32, unpack8_32, unpack9_32, + unpack10_32, unpack11_32, unpack12_32, unpack13_32, unpack14_32, + unpack15_32, unpack16_32, unpack17_32, unpack18_32, unpack19_32, + unpack20_32, unpack21_32, unpack22_32, unpack23_32, unpack24_32, + unpack25_32, unpack26_32, unpack27_32, unpack28_32, unpack29_32, + unpack30_32, unpack31_32, unpack32_32}; + +static packfnc pack32[33] = { + nullpacker, pack1_32, pack2_32, pack3_32, pack4_32, pack5_32, + pack6_32, pack7_32, pack8_32, pack9_32, pack10_32, pack11_32, + pack12_32, pack13_32, pack14_32, pack15_32, pack16_32, pack17_32, + pack18_32, pack19_32, pack20_32, pack21_32, pack22_32, pack23_32, + pack24_32, pack25_32, pack26_32, pack27_32, pack28_32, pack29_32, + pack30_32, pack31_32, pack32_32}; - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); +static uint32_t bits(const uint32_t v) { + return v == 0 ? 
0 : 32 - __builtin_clz(v); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +uint32_t *SIMDCompressionLib::FrameOfReference::compress_length( + const uint32_t *in, uint32_t length, uint32_t *out) { + if (length == 0) + return out; + uint32_t m = in[0]; + uint32_t M = in[0]; + for (uint32_t i = 1; i < length; ++i) { + if (in[i] > M) + M = in[i]; + if (in[i] < m) + m = in[i]; + } + int b = bits(static_cast(M - m)); + + out[0] = m; + ++out; + out[0] = M; + ++out; + uint32_t k = 0; + for (; k + 32 <= length; k += 32, in += 32) { + out = pack32[b](m, in, out); + } + // we could pack the rest, but we don't bother + for (; k < length; ++k, in++, out++) { + out[0] = in[0]; + } + return out; +} - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); - - OutReg = _mm_add_epi32(OutReg, 
initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +const uint32_t *SIMDCompressionLib::FrameOfReference::uncompress_length( + const uint32_t *in, uint32_t *out, uint32_t nvalue) { + if (nvalue == 0) + return out; + uint32_t m = in[0]; + ++in; + uint32_t M = in[0]; + ++in; + int b = bits(static_cast(M - m)); + for (uint32_t k = 0; k < nvalue / 32; ++k) { + unpack32[b](m, in + b * k, out + 32 * k); + } + out = out + nvalue / 32 * 32; + in = in + nvalue / 32 * b; + // we could pack the rest, but we don't bother + for (uint32_t k = nvalue / 32 * 32; k < nvalue; ++k, in++, out++) { + out[0] = in[0]; + } + return in; +} +size_t SIMDCompressionLib::FrameOfReference::append( + uint8_t *inbyte, const size_t currentcompressedsizeinbytes, + uint32_t value) { + if (currentcompressedsizeinbytes == 0) { + size_t nvalue = 16; // 16 is arbitrary + encodeArray(&value, 1, (uint32_t *)inbyte, nvalue); + return nvalue * sizeof(uint32_t); + } + uint32_t *in = (uint32_t *)inbyte; + + uint32_t nvalue = *in; + in[0] += 1; // incrementing! 
+ in++; + + uint32_t m = in[0]; + uint32_t M = in[1]; + + int b = bits(static_cast(M - m)); + if ((value < m) || (value > M)) { // should be unlikely + uint32_t newm = m; + uint32_t newM = M; + + if (value < m) { + newm = value; + } + if (value > M) { + newM = value; + } + int newb = bits(static_cast(newM - newm)); + if ((newb != b) || (value < m)) { // unlucky path + vector buffer(nvalue + 1); + uncompress_length(in, buffer.data(), nvalue); + buffer[nvalue] = value; + uint32_t *newin = compress_length(buffer.data(), nvalue + 1, in); + return (newin - in + 1) * sizeof(uint32_t); + } else { // only max changed + in[1] = newM; + M = newM; // probably unnecessary + } + } + // add one + in += 2; + uint32_t headersize = 3; + uint32_t index = nvalue; + if (b == 32) { + in[index] = value; + return (index + 1 + headersize) * sizeof(uint32_t); + } + uint32_t oldpackedlength = nvalue / 32 * 32; + uint32_t newpackedlength = (nvalue + 1) / 32 * 32; + uint32_t packedsizeinwords = oldpackedlength * b / 32; + in[packedsizeinwords + index - oldpackedlength] = value; + if (oldpackedlength == newpackedlength) { + return (headersize + (packedsizeinwords + index - oldpackedlength)) * + sizeof(uint32_t); + } + // we now have exactly 32 to pack + uint32_t buffer[32]; + memcpy(buffer, in + packedsizeinwords, + 32 * sizeof(uint32_t)); // copy to a safe buffer + uint32_t *out = pack32[b](m, buffer, in + packedsizeinwords); + return (out - in + headersize) * sizeof(uint32_t); } +struct selectmetadata { + uint32_t m; + const uint32_t *in; + uint32_t b; + uint32_t length; +}; +static uint32_t fastselect(selectmetadata *s, size_t index) { + if (s->b == 32) { + return s->in[index]; + } + uint32_t packedlength = s->length / 32 * 32; + if (index > packedlength) { + uint32_t packedsizeinwords = packedlength * s->b / 32; + return s->in[packedsizeinwords + index - packedlength]; + } + const int bitoffset = static_cast(index) * s->b; /* how many bits */ + const int firstword = bitoffset / 32; + const 
int secondword = (bitoffset + s->b - 1) / 32; + const uint32_t firstpart = s->in[firstword] >> (bitoffset % 32); + const uint32_t mask = (1 << s->b) - 1; + if (firstword == secondword) { + /* easy common case*/ + return s->m + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = s->in[firstword + 1]; + const int usablebitsinfirstword = 32 - (bitoffset % 32); + return s->m + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); + } +} +// Performs a lower bound find in the encoded array. +// Returns the index +size_t findlowerbound(const uint32_t *in, uint32_t key, uint32_t *presult) { + selectmetadata s; + s.length = *in; + in++; + s.m = *in; + ++in; + uint32_t M = *in; + ++in; + s.b = bits(M - s.m); + s.in = in; + + int count = s.length; + int begin = 0; + uint32_t val; + while (count > 0) { + int step = count / 2; + val = fastselect(&s, begin + step); + if (val < key) { + begin += step + 1; + count -= step + 1; + } else + count = step; + } + *presult = fastselect(&s, begin); + return begin; +} -static __m128i iunpackFOR24(__m128i initOffset, const __m128i* in, uint32_t * _out) { +/*** + * BEGIN OF linear search + */ - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<24)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search1_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) & 1; + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 1) & 1; + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 2) & 1; + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 3) & 1; + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 4) & 1; + 
*decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 5) & 1; + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 6) & 1; + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 7) & 1; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) & 1; + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 9) & 1; + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 10) & 1; + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 11) & 1; + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 12) & 1; + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 13) & 1; + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 14) & 1; + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 15) & 1; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) & 1; + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 17) & 1; + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 18) & 1; + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 19) & 1; + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 20) & 1; + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 21) & 1; + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 22) & 1; + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 23) & 1; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24) & 1; + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 25) & 1; + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 26) & 1; + *decoded 
+= base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 27) & 1; + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 28) & 1; + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 29) & 1; + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 30) & 1; + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 31); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static int search2_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 2) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 4) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 6) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 8) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 10) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 12) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 14) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 18) % (1U << 2); + *decoded += 
base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 20) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 22) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 24) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 26) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 28) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 30); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 2) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 4) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 6) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 8) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 10) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 12) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 14) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 18) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 20) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 22) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 24) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 26) % (1U << 2); + *decoded += base; + if (*decoded >= 
target) + return 29; + *decoded = ((*in) >> 28) % (1U << 2); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 30); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search3_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 3) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 6) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 9) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 12) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 15) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 18) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 21) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 27) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 1)) << (3 - 1); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 1) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 4) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 7) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 10) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 13) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 3); + *decoded += base; 
+ if (*decoded >= target) + return 16; + *decoded = ((*in) >> 19) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 22) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 25) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 28) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 2)) << (3 - 2); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 2) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 5) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 11) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 14) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 17) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 20) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 23) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 26) % (1U << 3); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 29); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; } +static int search4_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 4) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 8) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 12) % (1U 
<< 4); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 16) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 20) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 24) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 28); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 4) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 8) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 12) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 16) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 20) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 24) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 28); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 4) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 8) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 12) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 16) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 20) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 24) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 28); + ++in; + *decoded += base; + if (*decoded >= 
target) + return 23; + *decoded = ((*in) >> 0) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 4) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 8) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 12) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 16) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 20) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 24) % (1U << 4); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 28); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search5_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 5) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 10) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 15) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 20) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 25) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 3)) << (5 - 3); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 3) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 13) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 18) % (1U << 5); + *decoded += base; 
+ if (*decoded >= target) + return 10; + *decoded = ((*in) >> 23) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 1)) << (5 - 1); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 1) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 6) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 11) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 21) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 26) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 4)) << (5 - 4); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 4) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 9) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 14) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 19) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 2)) << (5 - 2); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 2) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 7) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 12) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 17) % (1U << 5); + *decoded += 
base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 22) % (1U << 5); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 27); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search6_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 6) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 12) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 18) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 24) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 4)) << (6 - 4); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 4) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 10) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 22) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 2)) << (6 - 2); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 2) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 8) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 14) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 20) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 26); + ++in; + *decoded += base; + if (*decoded >= target) + return 
15; + *decoded = ((*in) >> 0) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 6) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 12) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 18) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 24) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 4)) << (6 - 4); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 4) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 10) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 22) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 2)) << (6 - 2); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 2) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 8) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 14) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 20) % (1U << 6); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 26); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} -static __m128i iunpackFOR25(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static int search7_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 7) % (1U << 
7); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 14) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 21) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 3)) << (7 - 3); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 3) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 10) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 17) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 6)) << (7 - 6); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 6) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 13) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 20) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 2)) << (7 - 2); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 2) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 9) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 23) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 5)) << (7 - 5); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 5) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 19; 
+ *decoded = ((*in) >> 12) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 19) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 1)) << (7 - 1); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 1) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 15) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 22) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 4)) << (7 - 4); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 4) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 11) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 18) % (1U << 7); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 25); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<25)-1); +static int search8_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded 
>= target) + return 4; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 8) % (1U 
<< 8); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 0) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 8) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 16) % (1U << 8); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 24); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search9_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 9) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 18) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 4)) << (9 - 4); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 4) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 13) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 22) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 8)) << (9 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 17) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 3)) << (9 - 3); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 3) % (1U 
<< 9); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 12) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 21) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 7)) << (9 - 7); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 7) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 2)) << (9 - 2); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 2) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 11) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 20) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 6)) << (9 - 6); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 6) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 15) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 1)) << (9 - 1); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 1) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 10) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 19) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 5)) << (9 - 5); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 5) % (1U << 9); + 
*decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 14) % (1U << 9); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 23); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search10_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 10) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 20) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 8)) << (10 - 8); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 8) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 18) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 6)) << (10 - 6); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 6) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 4)) << (10 - 4); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 4) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 14) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 2)) << (10 - 2); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 2) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 12) % (1U << 10); + *decoded += base; + if (*decoded >= 
target) + return 14; + *decoded = ((*in) >> 22); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 10) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 20) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 8)) << (10 - 8); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 8) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 18) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 6)) << (10 - 6); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 6) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 4)) << (10 - 4); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 4) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 14) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 2)) << (10 - 2); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 2) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 12) % (1U << 10); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 22); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); +static int search11_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 11) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 1)) << (11 - 1); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 1) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 12) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 2)) << (11 - 2); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 2) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 13) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 3)) << (11 - 3); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 3) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 14) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 4)) << (11 - 4); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 4) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 15) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 5)) << (11 - 5); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 5) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 11); + *decoded += base; + if 
(*decoded >= target) + return 16; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 6)) << (11 - 6); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 6) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 17) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 7)) << (11 - 7); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 7) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 18) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 8)) << (11 - 8); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 19) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 9)) << (11 - 9); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 9) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 20) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 10)) << (11 - 10); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 10) % (1U << 11); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 21); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); +static int search12_32(uint32_t base, const uint32_t *in, uint32_t 
target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 12) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 4)) << (12 - 4); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 4) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 16) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 8)) << (12 - 8); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 8) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 20); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 12) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 4)) << (12 - 4); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 4) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 16) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 8)) << (12 - 8); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 8) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 20); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 12) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 
24); + ++in; + *decoded |= ((*in) % (1U << 4)) << (12 - 4); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 4) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 16) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 8)) << (12 - 8); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 8) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 20); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 0) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 12) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 4)) << (12 - 4); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 4) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 16) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 8)) << (12 - 8); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 8) % (1U << 12); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 20); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search13_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 13) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 7)) << 
(13 - 7); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 7) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 1)) << (13 - 1); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 1) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 14) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 8)) << (13 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 2)) << (13 - 2); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 2) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 15) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 9)) << (13 - 9); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 9) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 3)) << (13 - 3); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 3) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 10)) << (13 - 10); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 10) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 4)) << (13 - 4); + 
*decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 4) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 17) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 11)) << (13 - 11); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 11) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 5)) << (13 - 5); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 5) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 18) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 12)) << (13 - 12); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 12) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 6)) << (13 - 6); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 6) % (1U << 13); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 19); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); +static int search14_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 14) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 10)) << (14 - 10); + 
*decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 10) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 6)) << (14 - 6); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 6) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 2)) << (14 - 2); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 2) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 12)) << (14 - 12); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 12) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 8)) << (14 - 8); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 8) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 4)) << (14 - 4); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 4) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 18); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 14) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 10)) << (14 - 10); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 10) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 19; + 
*decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 6)) << (14 - 6); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 6) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 2)) << (14 - 2); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 2) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 12)) << (14 - 12); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 12) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 8)) << (14 - 8); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 8) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 4)) << (14 - 4); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 4) % (1U << 14); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 18); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search15_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 15) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 13)) << (15 - 13); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 13) % (1U << 15); + *decoded += base; 
+ if (*decoded >= target) + return 3; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 11)) << (15 - 11); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 11) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 9)) << (15 - 9); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 9) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 7)) << (15 - 7); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 7) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 5)) << (15 - 5); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 5) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 3)) << (15 - 3); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 3) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 1)) << (15 - 1); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 1) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 14)) << (15 - 14); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 14) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 12)) << (15 - 12); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 12) % 
(1U << 15); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 10)) << (15 - 10); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 10) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 8)) << (15 - 8); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 6)) << (15 - 6); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 6) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 4)) << (15 - 4); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 4) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 2)) << (15 - 2); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 2) % (1U << 15); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 17); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); +static int search16_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 16); + ++in; + *decoded 
+= base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = 
((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 0) % (1U << 16); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 16); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search17_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 2)) << (17 - 2); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 2) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 4)) << (17 - 4); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 4) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 6)) << (17 - 6); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 6) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 8)) << (17 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) % (1U << 17); + *decoded += base; + if (*decoded >= 
target) + return 8; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 10)) << (17 - 10); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 10) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 12)) << (17 - 12); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 12) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 14)) << (17 - 14); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 14) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 16)) << (17 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 1)) << (17 - 1); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 1) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 3)) << (17 - 3); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 3) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 5)) << (17 - 5); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 5) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 7)) << (17 - 7); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 7) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 9)) << (17 - 9); + *decoded += base; + if (*decoded >= target) + return 
24; + *decoded = ((*in) >> 9) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 11)) << (17 - 11); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 11) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 13)) << (17 - 13); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 13) % (1U << 17); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 15)) << (17 - 15); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 15); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search18_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 4)) << (18 - 4); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 4) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 8)) << (18 - 8); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 8) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 12)) << (18 - 12); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 12) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 16)) << (18 - 
16); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 2)) << (18 - 2); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 2) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 6)) << (18 - 6); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 6) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 10)) << (18 - 10); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 10) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 14)) << (18 - 14); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 14); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 4)) << (18 - 4); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 4) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 8)) << (18 - 8); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 8) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 12)) << (18 - 12); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 12) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 16)) << (18 - 16); + *decoded += base; + if (*decoded >= target) + return 23; + 
*decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 2)) << (18 - 2); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 2) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 6)) << (18 - 6); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 6) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 10)) << (18 - 10); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 10) % (1U << 18); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 14)) << (18 - 14); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 14); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); +static int search19_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 6)) << (19 - 6); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 6) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 12)) << (19 - 12); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 12) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 18)) << (19 - 18); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 
18); + ++in; + *decoded |= ((*in) % (1U << 5)) << (19 - 5); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 5) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 11)) << (19 - 11); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 11) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 17)) << (19 - 17); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 4)) << (19 - 4); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 4) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 10)) << (19 - 10); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 10) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 16)) << (19 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 3)) << (19 - 3); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 3) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 9)) << (19 - 9); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 9) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 15)) << (19 - 15); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 2)) << (19 - 2); + *decoded += base; + if (*decoded >= target) + return 21; 
+ *decoded = ((*in) >> 2) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 8)) << (19 - 8); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 14)) << (19 - 14); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 1)) << (19 - 1); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 1) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 7)) << (19 - 7); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 7) % (1U << 19); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 13)) << (19 - 13); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 13); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search20_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 8)) << (20 - 8); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 8) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 16)) << (20 - 16); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 4)) << (20 - 4); + *decoded += base; + if 
(*decoded >= target) + return 4; + *decoded = ((*in) >> 4) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 12)) << (20 - 12); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 12); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 8)) << (20 - 8); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 8) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 16)) << (20 - 16); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 4)) << (20 - 4); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 4) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 12)) << (20 - 12); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 12); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 8)) << (20 - 8); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 8) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 16)) << (20 - 16); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 4)) << (20 - 4); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 4) % (1U << 20); + 
*decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 12)) << (20 - 12); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 12); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 0) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 8)) << (20 - 8); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 8) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 16)) << (20 - 16); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 4)) << (20 - 4); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 4) % (1U << 20); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 12)) << (20 - 12); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 12); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); +static int search21_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 10)) << (21 - 10); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 10) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 20)) << (21 - 20); + *decoded += base; + if 
(*decoded >= target) + return 3; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 9)) << (21 - 9); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 9) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 19)) << (21 - 19); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 8)) << (21 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 18)) << (21 - 18); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 7)) << (21 - 7); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 7) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 17)) << (21 - 17); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 6)) << (21 - 6); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 6) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 16)) << (21 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 5)) << (21 - 5); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 5) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 15)) << (21 - 15); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 15); + ++in; + *decoded 
|= ((*in) % (1U << 4)) << (21 - 4); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 4) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 14)) << (21 - 14); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 3)) << (21 - 3); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 3) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 13)) << (21 - 13); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 2)) << (21 - 2); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 2) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 12)) << (21 - 12); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 1)) << (21 - 1); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 1) % (1U << 21); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 11)) << (21 - 11); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 11); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - 
++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); - - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int search22_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 12)) << (22 - 12); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 2)) << (22 - 2); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = 
((*in) >> 2) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 14)) << (22 - 14); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 4)) << (22 - 4); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 4) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 16)) << (22 - 16); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 6)) << (22 - 6); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 6) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 18)) << (22 - 18); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 8)) << (22 - 8); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 8) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 20)) << (22 - 20); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 10)) << (22 - 10); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 10); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 12)) << (22 - 12); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 2)) << (22 - 2); + *decoded += base; + if (*decoded >= target) + return 
18; + *decoded = ((*in) >> 2) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 14)) << (22 - 14); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 4)) << (22 - 4); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 4) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 16)) << (22 - 16); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 6)) << (22 - 6); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 6) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 18)) << (22 - 18); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 8)) << (22 - 8); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 8) % (1U << 22); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 20)) << (22 - 20); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 10)) << (22 - 10); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 10); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search23_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 14)) << (23 - 14); + *decoded += base; + if (*decoded >= target) + return 1; 
+ *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 5)) << (23 - 5); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 5) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 19)) << (23 - 19); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 10)) << (23 - 10); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 1)) << (23 - 1); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 1) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 15)) << (23 - 15); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 6)) << (23 - 6); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 6) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 20)) << (23 - 20); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 11)) << (23 - 11); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 11); + ++in; + *decoded |= ((*in) % (1U << 2)) << (23 - 2); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 2) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 16)) << (23 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 7)) << (23 - 7); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 7) % (1U << 
23); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 21)) << (23 - 21); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 12)) << (23 - 12); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 3)) << (23 - 3); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 3) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 17)) << (23 - 17); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 8)) << (23 - 8); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 22)) << (23 - 22); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 13)) << (23 - 13); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 4)) << (23 - 4); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 4) % (1U << 23); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 18)) << (23 - 18); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 9)) << (23 - 9); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 9); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - return initOffset; +static int search24_32(uint32_t base, const uint32_t *in, uint32_t target, 
+ uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if 
(*decoded >= target) + return 16; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 0) % (1U << 24); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 16)) << (24 - 16); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 8)) << (24 - 8); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 8); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search25_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U 
<< 25); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 18)) << (25 - 18); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 11)) << (25 - 11); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 11); + ++in; + *decoded |= ((*in) % (1U << 4)) << (25 - 4); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 4) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 22)) << (25 - 22); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 15)) << (25 - 15); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 8)) << (25 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 1)) << (25 - 1); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 1) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 19)) << (25 - 19); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 12)) << (25 - 12); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 5)) << (25 - 5); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 5) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 23)) << (25 - 23); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 16)) 
<< (25 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 9)) << (25 - 9); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 9); + ++in; + *decoded |= ((*in) % (1U << 2)) << (25 - 2); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 2) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 20)) << (25 - 20); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 13)) << (25 - 13); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 6)) << (25 - 6); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 6) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 24)) << (25 - 24); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 17)) << (25 - 17); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 10)) << (25 - 10); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 3)) << (25 - 3); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 3) % (1U << 25); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 21)) << (25 - 21); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 14)) << (25 - 14); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 14); + ++in; + *decoded |= 
((*in) % (1U << 7)) << (25 - 7); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 7); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; } +static int search26_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 20)) << (26 - 20); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 14)) << (26 - 14); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 8)) << (26 - 8); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 2)) << (26 - 2); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 2) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 22)) << (26 - 22); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 16)) << (26 - 16); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 10)) << (26 - 10); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 4)) << (26 - 4); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 4) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 24)) << (26 - 24); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 18)) << (26 - 18); + *decoded += base; + if (*decoded 
>= target) + return 12; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 12)) << (26 - 12); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 6)) << (26 - 6); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 6); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 20)) << (26 - 20); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 14)) << (26 - 14); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 8)) << (26 - 8); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 2)) << (26 - 2); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 2) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 22)) << (26 - 22); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 16)) << (26 - 16); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 10)) << (26 - 10); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 4)) << (26 - 4); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 4) % (1U << 26); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 24)) << (26 - 24); + *decoded += base; + if (*decoded >= target) + return 27; + 
*decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 18)) << (26 - 18); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 12)) << (26 - 12); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 6)) << (26 - 6); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 6); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search27_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 27); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 22)) << (27 - 22); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 17)) << (27 - 17); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 12)) << (27 - 12); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 7)) << (27 - 7); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 7); + ++in; + *decoded |= ((*in) % (1U << 2)) << (27 - 2); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 2) % (1U << 27); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 24)) << (27 - 24); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 19)) << (27 - 19); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 14)) << (27 - 14); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 
14); + ++in; + *decoded |= ((*in) % (1U << 9)) << (27 - 9); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 9); + ++in; + *decoded |= ((*in) % (1U << 4)) << (27 - 4); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 4) % (1U << 27); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 26)) << (27 - 26); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 21)) << (27 - 21); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 16)) << (27 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 11)) << (27 - 11); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 11); + ++in; + *decoded |= ((*in) % (1U << 6)) << (27 - 6); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 6); + ++in; + *decoded |= ((*in) % (1U << 1)) << (27 - 1); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 1) % (1U << 27); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 23)) << (27 - 23); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 18)) << (27 - 18); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 13)) << (27 - 13); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 8)) << (27 - 8); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 3)) << (27 - 3); + *decoded += base; + if 
(*decoded >= target) + return 24; + *decoded = ((*in) >> 3) % (1U << 27); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 25)) << (27 - 25); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 20)) << (27 - 20); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 15)) << (27 - 15); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 10)) << (27 - 10); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 5)) << (27 - 5); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 5); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search28_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 28); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 24)) << (28 - 24); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 20)) << (28 - 20); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 16)) << (28 - 16); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 12)) << (28 - 12); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 8)) << (28 - 8); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 4)) << (28 - 4); + *decoded += base; + if (*decoded >= 
target) + return 6; + *decoded = ((*in) >> 4); + ++in; + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 0) % (1U << 28); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 24)) << (28 - 24); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 20)) << (28 - 20); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 16)) << (28 - 16); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 12)) << (28 - 12); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 8)) << (28 - 8); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 4)) << (28 - 4); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 4); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 28); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 24)) << (28 - 24); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 20)) << (28 - 20); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 16)) << (28 - 16); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 12)) << (28 - 12); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 8)) << (28 - 8); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) 
>> 8); + ++in; + *decoded |= ((*in) % (1U << 4)) << (28 - 4); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 4); + ++in; + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 0) % (1U << 28); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 24)) << (28 - 24); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 20)) << (28 - 20); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 16)) << (28 - 16); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 12)) << (28 - 12); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 8)) << (28 - 8); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 4)) << (28 - 4); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 4); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} -static __m128i iunpackFOR26(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static int search29_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 29); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 26)) << (29 - 26); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 23)) << (29 - 23); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 20)) << (29 - 20); + *decoded += base; + if (*decoded >= target) + return 3; + 
*decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 17)) << (29 - 17); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 14)) << (29 - 14); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 11)) << (29 - 11); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 11); + ++in; + *decoded |= ((*in) % (1U << 8)) << (29 - 8); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 5)) << (29 - 5); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 5); + ++in; + *decoded |= ((*in) % (1U << 2)) << (29 - 2); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 2) % (1U << 29); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 28)) << (29 - 28); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 25)) << (29 - 25); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 22)) << (29 - 22); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 19)) << (29 - 19); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 16)) << (29 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 13)) << (29 - 13); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 10)) << (29 - 10); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 10); + ++in; + 
*decoded |= ((*in) % (1U << 7)) << (29 - 7); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 7); + ++in; + *decoded |= ((*in) % (1U << 4)) << (29 - 4); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 4); + ++in; + *decoded |= ((*in) % (1U << 1)) << (29 - 1); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 1) % (1U << 29); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 27)) << (29 - 27); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 24)) << (29 - 24); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 21)) << (29 - 21); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 18)) << (29 - 18); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 15)) << (29 - 15); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 12)) << (29 - 12); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 9)) << (29 - 9); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 9); + ++in; + *decoded |= ((*in) % (1U << 6)) << (29 - 6); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 6); + ++in; + *decoded |= ((*in) % (1U << 3)) << (29 - 3); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 3); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const 
__m128i mask = _mm_set1_epi32((1U<<26)-1); +static int search30_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 30); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 28)) << (30 - 28); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 26)) << (30 - 26); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 24)) << (30 - 24); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 22)) << (30 - 22); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 20)) << (30 - 20); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 18)) << (30 - 18); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 16)) << (30 - 16); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 14)) << (30 - 14); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 12)) << (30 - 12); + *decoded += base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 10)) << (30 - 10); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 8)) << (30 - 8); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 6)) << (30 - 6); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 6); + ++in; + 
*decoded |= ((*in) % (1U << 4)) << (30 - 4); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 4); + ++in; + *decoded |= ((*in) % (1U << 2)) << (30 - 2); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 2); + ++in; + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 0) % (1U << 30); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 28)) << (30 - 28); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 26)) << (30 - 26); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 24)) << (30 - 24); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 22)) << (30 - 22); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 20)) << (30 - 20); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 18)) << (30 - 18); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 16)) << (30 - 16); + *decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 14)) << (30 - 14); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 12)) << (30 - 12); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 10)) << (30 - 10); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 8)) << (30 - 8); + *decoded += base; + if (*decoded >= 
target) + return 27; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 6)) << (30 - 6); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 6); + ++in; + *decoded |= ((*in) % (1U << 4)) << (30 - 4); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 4); + ++in; + *decoded |= ((*in) % (1U << 2)) << (30 - 2); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 2); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search31_32(uint32_t base, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + + *decoded = ((*in) >> 0) % (1U << 31); + *decoded += base; + if (*decoded >= target) + return 0; + *decoded = ((*in) >> 31); + ++in; + *decoded |= ((*in) % (1U << 30)) << (31 - 30); + *decoded += base; + if (*decoded >= target) + return 1; + *decoded = ((*in) >> 30); + ++in; + *decoded |= ((*in) % (1U << 29)) << (31 - 29); + *decoded += base; + if (*decoded >= target) + return 2; + *decoded = ((*in) >> 29); + ++in; + *decoded |= ((*in) % (1U << 28)) << (31 - 28); + *decoded += base; + if (*decoded >= target) + return 3; + *decoded = ((*in) >> 28); + ++in; + *decoded |= ((*in) % (1U << 27)) << (31 - 27); + *decoded += base; + if (*decoded >= target) + return 4; + *decoded = ((*in) >> 27); + ++in; + *decoded |= ((*in) % (1U << 26)) << (31 - 26); + *decoded += base; + if (*decoded >= target) + return 5; + *decoded = ((*in) >> 26); + ++in; + *decoded |= ((*in) % (1U << 25)) << (31 - 25); + *decoded += base; + if (*decoded >= target) + return 6; + *decoded = ((*in) >> 25); + ++in; + *decoded |= ((*in) % (1U << 24)) << (31 - 24); + *decoded += base; + if (*decoded >= target) + return 7; + *decoded = ((*in) >> 24); + ++in; + *decoded |= ((*in) % (1U << 23)) << (31 - 23); + *decoded += base; + if (*decoded >= target) + return 8; + *decoded = ((*in) >> 23); + ++in; + *decoded |= ((*in) % (1U << 22)) << (31 - 22); + *decoded += 
base; + if (*decoded >= target) + return 9; + *decoded = ((*in) >> 22); + ++in; + *decoded |= ((*in) % (1U << 21)) << (31 - 21); + *decoded += base; + if (*decoded >= target) + return 10; + *decoded = ((*in) >> 21); + ++in; + *decoded |= ((*in) % (1U << 20)) << (31 - 20); + *decoded += base; + if (*decoded >= target) + return 11; + *decoded = ((*in) >> 20); + ++in; + *decoded |= ((*in) % (1U << 19)) << (31 - 19); + *decoded += base; + if (*decoded >= target) + return 12; + *decoded = ((*in) >> 19); + ++in; + *decoded |= ((*in) % (1U << 18)) << (31 - 18); + *decoded += base; + if (*decoded >= target) + return 13; + *decoded = ((*in) >> 18); + ++in; + *decoded |= ((*in) % (1U << 17)) << (31 - 17); + *decoded += base; + if (*decoded >= target) + return 14; + *decoded = ((*in) >> 17); + ++in; + *decoded |= ((*in) % (1U << 16)) << (31 - 16); + *decoded += base; + if (*decoded >= target) + return 15; + *decoded = ((*in) >> 16); + ++in; + *decoded |= ((*in) % (1U << 15)) << (31 - 15); + *decoded += base; + if (*decoded >= target) + return 16; + *decoded = ((*in) >> 15); + ++in; + *decoded |= ((*in) % (1U << 14)) << (31 - 14); + *decoded += base; + if (*decoded >= target) + return 17; + *decoded = ((*in) >> 14); + ++in; + *decoded |= ((*in) % (1U << 13)) << (31 - 13); + *decoded += base; + if (*decoded >= target) + return 18; + *decoded = ((*in) >> 13); + ++in; + *decoded |= ((*in) % (1U << 12)) << (31 - 12); + *decoded += base; + if (*decoded >= target) + return 19; + *decoded = ((*in) >> 12); + ++in; + *decoded |= ((*in) % (1U << 11)) << (31 - 11); + *decoded += base; + if (*decoded >= target) + return 20; + *decoded = ((*in) >> 11); + ++in; + *decoded |= ((*in) % (1U << 10)) << (31 - 10); + *decoded += base; + if (*decoded >= target) + return 21; + *decoded = ((*in) >> 10); + ++in; + *decoded |= ((*in) % (1U << 9)) << (31 - 9); + *decoded += base; + if (*decoded >= target) + return 22; + *decoded = ((*in) >> 9); + ++in; + *decoded |= ((*in) % (1U << 8)) << (31 - 8); + 
*decoded += base; + if (*decoded >= target) + return 23; + *decoded = ((*in) >> 8); + ++in; + *decoded |= ((*in) % (1U << 7)) << (31 - 7); + *decoded += base; + if (*decoded >= target) + return 24; + *decoded = ((*in) >> 7); + ++in; + *decoded |= ((*in) % (1U << 6)) << (31 - 6); + *decoded += base; + if (*decoded >= target) + return 25; + *decoded = ((*in) >> 6); + ++in; + *decoded |= ((*in) % (1U << 5)) << (31 - 5); + *decoded += base; + if (*decoded >= target) + return 26; + *decoded = ((*in) >> 5); + ++in; + *decoded |= ((*in) % (1U << 4)) << (31 - 4); + *decoded += base; + if (*decoded >= target) + return 27; + *decoded = ((*in) >> 4); + ++in; + *decoded |= ((*in) % (1U << 3)) << (31 - 3); + *decoded += base; + if (*decoded >= target) + return 28; + *decoded = ((*in) >> 3); + ++in; + *decoded |= ((*in) % (1U << 2)) << (31 - 2); + *decoded += base; + if (*decoded >= target) + return 29; + *decoded = ((*in) >> 2); + ++in; + *decoded |= ((*in) % (1U << 1)) << (31 - 1); + *decoded += base; + if (*decoded >= target) + return 30; + *decoded = ((*in) >> 1); + ++in; + *decoded += base; + if (*decoded >= target) + return 31; + return -1; +} +static int search32_32(uint32_t, const uint32_t *in, uint32_t target, + uint32_t *decoded) { + for (int k = 0; k < 32; ++k) { + if (in[k] >= target) { + *decoded = in[k]; + return k; + } + } + return -1; +} - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static int searchlinearfor(uint32_t base, const uint32_t *in, uint32_t bit, + uint32_t target, uint32_t *presult) { + switch (bit) { + case 0: + if (target < base) + return -1; + else { + *presult = base; + return 0; + } - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + case 1: + return search1_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); + case 2: + return search2_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + case 3: + return search3_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 4: + return search4_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + case 5: + return search5_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 6: + return search6_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - - OutReg = _mm_add_epi32(OutReg, 
initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 7: + return search7_32(base, in, target, presult); + case 8: + return search8_32(base, in, target, presult); - return initOffset; + case 9: + return search9_32(base, in, target, presult); -} + case 10: + return search10_32(base, in, target, presult); + case 11: + return search11_32(base, in, target, presult); + case 12: + return search12_32(base, in, target, presult); + case 13: + return search13_32(base, in, target, presult); -static __m128i iunpackFOR27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + case 14: + return search14_32(base, in, target, presult); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i 
OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<27)-1); + case 15: + return search15_32(base, in, target, presult); + case 16: + return search16_32(base, in, target, presult); + case 17: + return search17_32(base, in, target, presult); - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 18: + return search18_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + case 19: + return search19_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 20: + return search20_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + case 21: + return search21_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 22: + return search22_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + case 23: + return search23_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 24: + return search24_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + case 25: + return search25_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 26: + return search26_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; 
- InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + case 27: + return search27_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 28: + return search28_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + case 29: + return search29_32(base, in, target, presult); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 30: + return search30_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + case 31: + return search31_32(base, in, target, presult); - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); 
- OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); - - OutReg = _mm_add_epi32(OutReg, 
initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; + case 32: + return search32_32(base, in, target, presult); + default: + break; + } + return (-1); } +/*** + * END OF linear search + */ +size_t findlower_linear(const uint32_t *in, uint32_t key, uint32_t *presult) { + selectmetadata s; + s.length = *in; + in++; + s.m = *in; + ++in; + uint32_t M = *in; + ++in; + s.b = bits(M - s.m); + s.in = in; + + for (uint32_t k = 0, offset = 0; k < s.length; k += 32, offset += s.b) { + if (32 + k > s.length) { + for (uint32_t z = 0; z < s.length - k; ++z) { + if (*(s.in + offset + z) >= key) { + *presult = *(s.in + offset + z); + return k + z; + } + } + return -1; + } + int x = searchlinearfor(s.m, s.in + offset, s.b, key, presult); + if (x >= 0) { + return k + x; + } + } + return -1; +} -static __m128i iunpackFOR28(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<28)-1); +// Performs a lower bound find in the encoded array. 
+// Returns the index +size_t SIMDCompressionLib::FrameOfReference::findLowerBound(const uint32_t *in, + const size_t, + uint32_t key, + uint32_t *presult) { + const bool USES_BINARY = true; + if (USES_BINARY) { + return findlowerbound(in, key, presult); + } else { + return findlower_linear(in, key, presult); + } +} +// Returns a decompressed value in an encoded array +uint32_t SIMDCompressionLib::FrameOfReference::select(const uint32_t *in, + size_t index) { + uint32_t length = *in; + in++; + uint32_t m = *in; + ++in; + uint32_t M = *in; + ++in; + uint32_t b = bits(M - m); + if (b == 32) { + return in[index]; + } + uint32_t packedlength = length / 32 * 32; + if (index > packedlength) { + uint32_t packedsizeinwords = packedlength * b / 32; + return in[packedsizeinwords + index - packedlength]; + } + const int bitoffset = static_cast(index) * b; /* how many bits */ + const int firstword = bitoffset / 32; + const int secondword = (bitoffset + b - 1) / 32; + const uint32_t firstpart = in[firstword] >> (bitoffset % 32); + const uint32_t mask = (1 << b) - 1; + if (firstword == secondword) { + /* easy common case*/ + return m + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = in[firstword + 1]; + const int usablebitsinfirstword = 32 - (bitoffset % 32); + return m + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); + } +} +/******************************** + * SIMD VERSION FOLLOWS + ******************************** + ******************************** + ********************************/ - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static __m128i iunpackFOR0(__m128i initOffset, const __m128i *_in, + uint32_t *_out) { + __m128i *out = (__m128i *)(_out); + int i; + (void)_in; + for (i = 0; i < 8; ++i) { + _mm_storeu_si128(out++, initOffset); + _mm_storeu_si128(out++, initOffset); + _mm_storeu_si128(out++, 
initOffset); + _mm_storeu_si128(out++, initOffset); + } + + return initOffset; +} - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); +static void ipackFOR0(__m128i initOffset, const uint32_t *_in, __m128i *out) { + (void)initOffset; + (void)_in; + (void)out; +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); +static void ipackFOR2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + 
++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR3(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); +static void ipackFOR4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = _mm_add_epi32(OutReg, 
initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static void ipackFOR6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); +} +static void ipackFOR7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); } +static void ipackFOR8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); +} +static void ipackFOR9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
/**
 * FOR-packs 128 consecutive 32-bit values into 10-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 10-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 10 packed
 * 128-bit words are written to `out` (32 values x 10 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 10 packed 128-bit words
 */
static void ipackFOR10(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 10;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 11-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 11-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 11 packed
 * 128-bit words are written to `out` (32 values x 11 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 11 packed 128-bit words
 */
static void ipackFOR11(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 11;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 12-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 12-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 12 packed
 * 128-bit words are written to `out` (32 values x 12 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 12 packed 128-bit words
 */
static void ipackFOR12(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 12;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 13-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 13-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 13 packed
 * 128-bit words are written to `out` (32 values x 13 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 13 packed 128-bit words
 */
static void ipackFOR13(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 13;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 14-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 14-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 14 packed
 * 128-bit words are written to `out` (32 values x 14 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 14 packed 128-bit words
 */
static void ipackFOR14(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 14;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 15-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta) and the resulting 15-bit deltas are
 * concatenated lane-wise into the output stream.  Exactly 15 packed
 * 128-bit words are written to `out` (32 values x 15 bits per lane).
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 15 packed 128-bit words
 */
static void ipackFOR15(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  const int bit = 15;
  __m128i accum = _mm_setzero_si128();
  int filled = 0; /* bits already occupied in the current output word */
  for (int k = 0; k < 32; ++k) {
    const __m128i delta = _mm_sub_epi32(_mm_loadu_si128(in + k), initOffset);
    accum = _mm_or_si128(accum, _mm_slli_epi32(delta, filled));
    filled += bit;
    if (filled >= 32) {
      /* output word is full: flush it and carry the spilled high bits */
      _mm_storeu_si128(out++, accum);
      filled -= 32;
      accum = filled ? _mm_srli_epi32(delta, bit - filled)
                     : _mm_setzero_si128();
    }
  }
}
/**
 * FOR-packs 128 consecutive 32-bit values into 16-bit fields.
 *
 * Each input value is reduced by the per-block reference `initOffset`
 * (frame-of-reference delta).  Because 16 divides 32 exactly, every
 * output word holds precisely two deltas per lane (low and high half),
 * so no bits ever spill across word boundaries.  Exactly 16 packed
 * 128-bit words are written to `out`.
 *
 * @param initOffset  reference value broadcast across all four lanes
 * @param _in         128 source integers (read as 32 unaligned __m128i)
 * @param out         destination for the 16 packed 128-bit words
 */
static void ipackFOR16(__m128i initOffset, const uint32_t *_in, __m128i *out) {
  const __m128i *in = (const __m128i *)(_in);
  for (int pair = 0; pair < 16; ++pair) {
    const __m128i lo =
        _mm_sub_epi32(_mm_loadu_si128(in + 2 * pair), initOffset);
    const __m128i hi =
        _mm_sub_epi32(_mm_loadu_si128(in + 2 * pair + 1), initOffset);
    /* low delta occupies bits 0..15, high delta bits 16..31 */
    _mm_storeu_si128(out + pair,
                     _mm_or_si128(lo, _mm_slli_epi32(hi, 16)));
  }
}
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); +static void ipackFOR18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + 
const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); +static void ipackFOR20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); +static void ipackFOR22(__m128i 
initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); +static void ipackFOR23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); +static void ipackFOR24(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); 
+ + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); +static void ipackFOR25(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); +static void ipackFOR26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp 
= _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - 
++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - - return initOffset; +static void ipackFOR27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} +static void ipackFOR28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 
20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); } +static void ipackFOR29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} +static void ipackFOR30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 
28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); +} +static void ipackFOR31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} -static __m128i iunpackFOR30(__m128i initOffset, const __m128i* in, uint32_t * _out) { +static void ipackFOR32(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = (const __m128i *)(_in); + __m128i OutReg; - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i InReg = _mm_loadu_si128(in); + (void)initOffset; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, 
OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); 
- _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
30-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - return initOffset; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); -} + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); -static __m128i iunpackFOR31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + ++out; + ++in; + InReg = _mm_loadu_si128(in); - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U<<31)-1); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp 
= _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = 
_mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
31-10), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; - InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); - - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = tmp; - OutReg = _mm_add_epi32(OutReg, initOffset); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); - return initOffset; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); -} + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); -static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { - __m128i * mout = (__m128i *)_out; - __m128i invec; - size_t k; - (void) initvalue; - for(k = 0; k < 128/4; ++k) { - invec = _mm_loadu_si128(in++); - _mm_storeu_si128(mout++, invec); - } - return invec; -} + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); -void simdpackFOR(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: - ipackFOR0(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 1: - ipackFOR1(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 2: - ipackFOR2(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 3: - ipackFOR3(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 4: - 
ipackFOR4(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 5: - ipackFOR5(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 6: - ipackFOR6(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 7: - ipackFOR7(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 8: - ipackFOR8(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 9: - ipackFOR9(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 10: - ipackFOR10(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 11: - ipackFOR11(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 12: - ipackFOR12(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 13: - ipackFOR13(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); - case 14: - ipackFOR14(initOffset,in,out); - break; + ++out; + ++in; + InReg = _mm_loadu_si128(in); - case 15: - ipackFOR15(initOffset,in,out); - break; + OutReg = InReg; + _mm_storeu_si128(out, OutReg); +} - case 16: - ipackFOR16(initOffset,in,out); - break; +static __m128i iunpackFOR1(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 17: - ipackFOR17(initOffset,in,out); - break; +static __m128i iunpackFOR2(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 18: - ipackFOR18(initOffset,in,out); - break; +static __m128i iunpackFOR3(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = 
_mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 19: - ipackFOR19(initOffset,in,out); - break; +static __m128i iunpackFOR4(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 20: - ipackFOR20(initOffset,in,out); - break; +static __m128i iunpackFOR5(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = 
+ _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
27); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 21: - ipackFOR21(initOffset,in,out); - break; +static __m128i iunpackFOR6(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 22: - ipackFOR22(initOffset,in,out); - break; +static __m128i iunpackFOR7(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + 
__m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; 
+ ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 23: - ipackFOR23(initOffset,in,out); - break; +static __m128i iunpackFOR8(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = 
_mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 24: - ipackFOR24(initOffset,in,out); - break; +static __m128i iunpackFOR9(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 25: - ipackFOR25(initOffset,in,out); - break; +static __m128i iunpackFOR10(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 26: - ipackFOR26(initOffset,in,out); - break; +static __m128i iunpackFOR11(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 27: - ipackFOR27(initOffset,in,out); - break; +static __m128i iunpackFOR12(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 28: - ipackFOR28(initOffset,in,out); - break; +static __m128i iunpackFOR13(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = 
InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 29: - ipackFOR29(initOffset,in,out); - break; +static __m128i iunpackFOR14(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; 
+ ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 30: - ipackFOR30(initOffset,in,out); - break; +static __m128i iunpackFOR15(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 31: - ipackFOR31(initOffset,in,out); - break; +static __m128i iunpackFOR16(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 32: - ipackFOR32(initOffset,in,out); - break; +static __m128i iunpackFOR17(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg 
= _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - default: - break; - } +static __m128i iunpackFOR18(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = 
_mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; } +static __m128i iunpackFOR19(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +static __m128i iunpackFOR20(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i 
InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); 
+ OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} +static __m128i iunpackFOR21(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} -void simdunpackFOR(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: - iunpackFOR0(initOffset, in,out); - break; +static __m128i iunpackFOR22(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); 
+ OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 1: - iunpackFOR1(initOffset, in,out); - break; +static __m128i iunpackFOR23(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + 
const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + OutReg 
= _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 2: - iunpackFOR2(initOffset, in,out); - break; +static __m128i iunpackFOR24(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 3: - iunpackFOR3(initOffset, in,out); - break; +static __m128i iunpackFOR25(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); 
+ OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 4: - iunpackFOR4(initOffset, in,out); - break; +static __m128i iunpackFOR26(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
26 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 
6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); 
+ + return initOffset; +} - case 5: - iunpackFOR5(initOffset, in,out); - break; +static __m128i iunpackFOR27(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + 
InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 6: - iunpackFOR6(initOffset, in,out); - break; +static __m128i iunpackFOR28(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 7: - 
iunpackFOR7(initOffset, in,out); - break; +static __m128i iunpackFOR29(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 8: - iunpackFOR8(initOffset, in,out); - break; +static __m128i iunpackFOR30(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 9: - iunpackFOR9(initOffset, in,out); - break; +static __m128i iunpackFOR31(__m128i initOffset, const __m128i *in, + uint32_t *_out) { + + __m128i *out = (__m128i *)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = 
_mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = _mm_loadu_si128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + return initOffset; +} - case 10: - iunpackFOR10(initOffset, in,out); - break; +static __m128i iunpackFOR32(__m128i initvalue, const __m128i *in, + uint32_t *_out) { + __m128i *mout = (__m128i *)_out; + __m128i invec; + size_t k; + (void)initvalue; + for (k = 0; k < 128 / 4; ++k) { + invec = _mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} - case 11: - iunpackFOR11(initOffset, in,out); - break; +void simdpackFOR(uint32_t initvalue, const uint32_t *in, __m128i *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + ipackFOR0(initOffset, in, out); + break; - case 12: - iunpackFOR12(initOffset, in,out); - break; + case 1: + ipackFOR1(initOffset, in, out); + break; - case 13: - iunpackFOR13(initOffset, in,out); - break; + case 2: + ipackFOR2(initOffset, in, out); + break; - case 14: - iunpackFOR14(initOffset, in,out); - break; + case 3: + 
ipackFOR3(initOffset, in, out); + break; - case 15: - iunpackFOR15(initOffset, in,out); - break; + case 4: + ipackFOR4(initOffset, in, out); + break; - case 16: - iunpackFOR16(initOffset, in,out); - break; + case 5: + ipackFOR5(initOffset, in, out); + break; - case 17: - iunpackFOR17(initOffset, in,out); - break; + case 6: + ipackFOR6(initOffset, in, out); + break; - case 18: - iunpackFOR18(initOffset, in,out); - break; + case 7: + ipackFOR7(initOffset, in, out); + break; - case 19: - iunpackFOR19(initOffset, in,out); - break; + case 8: + ipackFOR8(initOffset, in, out); + break; + + case 9: + ipackFOR9(initOffset, in, out); + break; + + case 10: + ipackFOR10(initOffset, in, out); + break; + + case 11: + ipackFOR11(initOffset, in, out); + break; + + case 12: + ipackFOR12(initOffset, in, out); + break; + + case 13: + ipackFOR13(initOffset, in, out); + break; + + case 14: + ipackFOR14(initOffset, in, out); + break; + + case 15: + ipackFOR15(initOffset, in, out); + break; + + case 16: + ipackFOR16(initOffset, in, out); + break; + + case 17: + ipackFOR17(initOffset, in, out); + break; + + case 18: + ipackFOR18(initOffset, in, out); + break; + + case 19: + ipackFOR19(initOffset, in, out); + break; + + case 20: + ipackFOR20(initOffset, in, out); + break; + + case 21: + ipackFOR21(initOffset, in, out); + break; + + case 22: + ipackFOR22(initOffset, in, out); + break; + + case 23: + ipackFOR23(initOffset, in, out); + break; + + case 24: + ipackFOR24(initOffset, in, out); + break; + + case 25: + ipackFOR25(initOffset, in, out); + break; - case 20: - iunpackFOR20(initOffset, in,out); - break; + case 26: + ipackFOR26(initOffset, in, out); + break; - case 21: - iunpackFOR21(initOffset, in,out); - break; + case 27: + ipackFOR27(initOffset, in, out); + break; - case 22: - iunpackFOR22(initOffset, in,out); - break; + case 28: + ipackFOR28(initOffset, in, out); + break; - case 23: - iunpackFOR23(initOffset, in,out); - break; + case 29: + ipackFOR29(initOffset, in, out); + break; - 
case 24: - iunpackFOR24(initOffset, in,out); - break; + case 30: + ipackFOR30(initOffset, in, out); + break; - case 25: - iunpackFOR25(initOffset, in,out); - break; + case 31: + ipackFOR31(initOffset, in, out); + break; - case 26: - iunpackFOR26(initOffset, in,out); - break; + case 32: + ipackFOR32(initOffset, in, out); + break; - case 27: - iunpackFOR27(initOffset, in,out); - break; + default: + break; + } +} - case 28: - iunpackFOR28(initOffset, in,out); - break; +void simdunpackFOR(uint32_t initvalue, const __m128i *in, uint32_t *out, + const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32(initvalue); + switch (bit) { + case 0: + iunpackFOR0(initOffset, in, out); + break; - case 29: - iunpackFOR29(initOffset, in,out); - break; + case 1: + iunpackFOR1(initOffset, in, out); + break; - case 30: - iunpackFOR30(initOffset, in,out); - break; + case 2: + iunpackFOR2(initOffset, in, out); + break; - case 31: - iunpackFOR31(initOffset, in,out); - break; + case 3: + iunpackFOR3(initOffset, in, out); + break; - case 32: - iunpackFOR32(initOffset, in,out); - break; + case 4: + iunpackFOR4(initOffset, in, out); + break; - default: - break; - } -} + case 5: + iunpackFOR5(initOffset, in, out); + break; + case 6: + iunpackFOR6(initOffset, in, out); + break; + case 7: + iunpackFOR7(initOffset, in, out); + break; -// Returns a decompressed value in an encoded array -uint32_t SIMDCompressionLib::SIMDFrameOfReference::select(const uint32_t *in, size_t index) { - //uint32_t length = *in; - in ++; - uint32_t m = *in; - ++in; - uint32_t M = *in; - ++in; - uint32_t bit = bits(M-m); - if(bit == 32) { - return in[index]; - } - in += index / 128 * 4 * bit; - const int slot = index % 128; - const int lane = slot % 4; /* we have 4 interleaved lanes */ - const int bitsinlane = (slot / 4) * bit; /* how many bits in lane */ - const int firstwordinlane = bitsinlane / 32; - const int secondwordinlane = (bitsinlane + bit - 1) / 32; - const uint32_t firstpart = in[4 * firstwordinlane + lane] 
- >> (bitsinlane % 32); - const uint32_t mask = (1 << bit) - 1; - if (firstwordinlane == secondwordinlane) { - /* easy common case*/ - return m + (firstpart & mask); - } else { - /* harder case where we need to combine two words */ - const uint32_t secondpart = in[4 * firstwordinlane + 4 + lane]; - const int usablebitsinfirstword = 32 - (bitsinlane % 32); - return m - + ((firstpart | (secondpart << usablebitsinfirstword)) - & mask); - } + case 8: + iunpackFOR8(initOffset, in, out); + break; + + case 9: + iunpackFOR9(initOffset, in, out); + break; + + case 10: + iunpackFOR10(initOffset, in, out); + break; + + case 11: + iunpackFOR11(initOffset, in, out); + break; + + case 12: + iunpackFOR12(initOffset, in, out); + break; + + case 13: + iunpackFOR13(initOffset, in, out); + break; + + case 14: + iunpackFOR14(initOffset, in, out); + break; + + case 15: + iunpackFOR15(initOffset, in, out); + break; + + case 16: + iunpackFOR16(initOffset, in, out); + break; + + case 17: + iunpackFOR17(initOffset, in, out); + break; + + case 18: + iunpackFOR18(initOffset, in, out); + break; + + case 19: + iunpackFOR19(initOffset, in, out); + break; + + case 20: + iunpackFOR20(initOffset, in, out); + break; + + case 21: + iunpackFOR21(initOffset, in, out); + break; + + case 22: + iunpackFOR22(initOffset, in, out); + break; + + case 23: + iunpackFOR23(initOffset, in, out); + break; + + case 24: + iunpackFOR24(initOffset, in, out); + break; + + case 25: + iunpackFOR25(initOffset, in, out); + break; -} + case 26: + iunpackFOR26(initOffset, in, out); + break; + case 27: + iunpackFOR27(initOffset, in, out); + break; + case 28: + iunpackFOR28(initOffset, in, out); + break; + case 29: + iunpackFOR29(initOffset, in, out); + break; + case 30: + iunpackFOR30(initOffset, in, out); + break; + case 31: + iunpackFOR31(initOffset, in, out); + break; -static uint32_t simdfastselect(const uint32_t * in, int b, uint32_t m, size_t index) { - //if (b == 32) { - // return in[index]; - //} - const int lane = 
index % 4; /* we have 4 interleaved lanes */ - const int bitsinlane = static_cast(index / 4) * b; /* how many bits in lane */ - const int firstwordinlane = bitsinlane / 32; - const int secondwordinlane = (bitsinlane + b - 1) / 32; - const uint32_t firstpart = in[4 * firstwordinlane + lane] - >> (bitsinlane % 32); - const uint32_t mask = (1 << b) - 1; - if (firstwordinlane == secondwordinlane) { - /* easy common case*/ - return m + (firstpart & mask); - } else { - /* harder case where we need to combine two words */ - const uint32_t secondpart = in[4 * firstwordinlane + 4 + lane]; - const int usablebitsinfirstword = 32 - (bitsinlane % 32); - return m - + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); - } + case 32: + iunpackFOR32(initOffset, in, out); + break; + default: + break; + } } +// Returns a decompressed value in an encoded array +uint32_t SIMDCompressionLib::SIMDFrameOfReference::select(const uint32_t *in, + size_t index) { + // uint32_t length = *in; + in++; + uint32_t m = *in; + ++in; + uint32_t M = *in; + ++in; + uint32_t bit = bits(M - m); + if (bit == 32) { + return in[index]; + } + in += index / 128 * 4 * bit; + const int slot = index % 128; + const int lane = slot % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = (slot / 4) * bit; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + bit - 1) / 32; + const uint32_t firstpart = + in[4 * firstwordinlane + lane] >> (bitsinlane % 32); + const uint32_t mask = (1 << bit) - 1; + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return m + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = in[4 * firstwordinlane + 4 + lane]; + const int usablebitsinfirstword = 32 - (bitsinlane % 32); + return m + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); + } +} +static uint32_t simdfastselect(const uint32_t *in, int b, uint32_t 
m, + size_t index) { + // if (b == 32) { + // return in[index]; + //} + const int lane = index % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = + static_cast(index / 4) * b; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + b - 1) / 32; + const uint32_t firstpart = + in[4 * firstwordinlane + lane] >> (bitsinlane % 32); + const uint32_t mask = (1 << b) - 1; + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return m + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = in[4 * firstwordinlane + 4 + lane]; + const int usablebitsinfirstword = 32 - (bitsinlane % 32); + return m + ((firstpart | (secondpart << usablebitsinfirstword)) & mask); + } +} // Performs a lower bound find in the encoded array. // Returns the index -size_t SIMDCompressionLib::SIMDFrameOfReference::findLowerBound(const uint32_t *in, const size_t , uint32_t key, - uint32_t *presult) { - uint32_t length = *in; - in ++; - uint32_t m = *in; - ++in; - uint32_t M = *in; - ++in; - uint32_t b = bits(M-m); - uint32_t count = length; - uint32_t begin = 0; - uint32_t val; - if(b == 32) { // we handle the special case separately - while (count > 0) { - uint32_t step = count / 2; - val = in[begin+step]; - if (val < key) { - begin += step + 1; - count -= step + 1; - } else count = step; - } - *presult = in[begin]; - return begin; - } +size_t SIMDCompressionLib::SIMDFrameOfReference::findLowerBound( + const uint32_t *in, const size_t, uint32_t key, uint32_t *presult) { + uint32_t length = *in; + in++; + uint32_t m = *in; + ++in; + uint32_t M = *in; + ++in; + uint32_t b = bits(M - m); + uint32_t count = length; + uint32_t begin = 0; + uint32_t val; + if (b == 32) { // we handle the special case separately while (count > 0) { - uint32_t step = count / 2; - val = simdfastselect(in,b,m, begin+step); - if (val < key) { - begin += step + 1; - count -= 
step + 1; - } else count = step; + uint32_t step = count / 2; + val = in[begin + step]; + if (val < key) { + begin += step + 1; + count -= step + 1; + } else + count = step; } - *presult = simdfastselect(in,b,m, begin); + *presult = in[begin]; return begin; + } + while (count > 0) { + uint32_t step = count / 2; + val = simdfastselect(in, b, m, begin + step); + if (val < key) { + begin += step + 1; + count -= step + 1; + } else + count = step; + } + *presult = simdfastselect(in, b, m, begin); + return begin; } static uint32_t minasint(const __m128i accumulator) { - const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ - const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ - return _mm_cvtsi128_si32(_tmp2); + const __m128i _tmp1 = _mm_min_epu32( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_min_epu32(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); } static uint32_t maxasint(const __m128i accumulator) { - const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ - const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ - return _mm_cvtsi128_si32(_tmp2); + const __m128i _tmp1 = _mm_max_epu32( + _mm_srli_si128(accumulator, 8), + accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = + _mm_max_epu32(_mm_srli_si128(_tmp1, 4), + _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); } - - -static void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) { - uint32_t lengthdividedby4 = length / 4; - uint32_t offset = 
lengthdividedby4 * 4; - uint32_t k; - *getmin = 0xFFFFFFFF; - *getmax = 0; - if (lengthdividedby4 > 0) { - const __m128i* pin = (const __m128i*)(in); - __m128i minaccumulator = _mm_loadu_si128(pin); - __m128i maxaccumulator = minaccumulator; - uint32_t k = 1; - for(; 4*k < lengthdividedby4 * 4; ++k) { - __m128i newvec = _mm_loadu_si128(pin+k); - minaccumulator = _mm_min_epu32(minaccumulator,newvec); - maxaccumulator = _mm_max_epu32(maxaccumulator,newvec); - } - *getmin = minasint(minaccumulator); - *getmax = maxasint(maxaccumulator); - } - for (k = offset; k < length; ++k) { - if (in[k] < *getmin) - *getmin = in[k]; - if (in[k] > *getmax) - *getmax = in[k]; +static void simdmaxmin_length(const uint32_t *in, uint32_t length, + uint32_t *getmin, uint32_t *getmax) { + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t k; + *getmin = 0xFFFFFFFF; + *getmax = 0; + if (lengthdividedby4 > 0) { + const __m128i *pin = (const __m128i *)(in); + __m128i minaccumulator = _mm_loadu_si128(pin); + __m128i maxaccumulator = minaccumulator; + uint32_t k = 1; + for (; 4 * k < lengthdividedby4 * 4; ++k) { + __m128i newvec = _mm_loadu_si128(pin + k); + minaccumulator = _mm_min_epu32(minaccumulator, newvec); + maxaccumulator = _mm_max_epu32(maxaccumulator, newvec); } + *getmin = minasint(minaccumulator); + *getmax = maxasint(maxaccumulator); + } + for (k = offset; k < length; ++k) { + if (in[k] < *getmin) + *getmin = in[k]; + if (in[k] > *getmax) + *getmax = in[k]; + } } - - - - -uint32_t * SIMDCompressionLib::SIMDFrameOfReference::simd_compress_length(const uint32_t * in, uint32_t length, uint32_t * out) { - if(length == 0) return out; - uint32_t m, M; - simdmaxmin_length(in, length, &m, &M); - int b = bits(static_cast(M-m)); - out[0] = m; - ++out; - out[0] = M; - ++out; - uint32_t k = 0; - for(; k+128<=length; k+=128,in+=128) { - simdpackFOR(m, in, (__m128i *) out, b); - out += b * 4; - } - if(length != k) out = (uint32_t *) 
simdpackFOR_length(m, in, length - k , (__m128i *) out,b); - in += length - k; - +uint32_t *SIMDCompressionLib::SIMDFrameOfReference::simd_compress_length( + const uint32_t *in, uint32_t length, uint32_t *out) { + if (length == 0) return out; + uint32_t m, M; + simdmaxmin_length(in, length, &m, &M); + int b = bits(static_cast(M - m)); + out[0] = m; + ++out; + out[0] = M; + ++out; + uint32_t k = 0; + for (; k + 128 <= length; k += 128, in += 128) { + simdpackFOR(m, in, (__m128i *)out, b); + out += b * 4; + } + if (length != k) + out = (uint32_t *)simdpackFOR_length(m, in, length - k, (__m128i *)out, b); + in += length - k; + + return out; } -size_t SIMDCompressionLib::SIMDFrameOfReference::append(uint8_t *inbyte, const size_t currentcompressedsizeinbytes, uint32_t value) { - if(currentcompressedsizeinbytes == 0) { - size_t nvalue = 16;// 16 is arbitrary - encodeArray(&value, 1, (uint32_t *)inbyte,nvalue); - return nvalue * sizeof(uint32_t); - } - uint32_t * in = (uint32_t *) inbyte; - - uint32_t nvalue = *in; - in[0] += 1;// incrementing! - in++; - - uint32_t m = in[0]; - uint32_t M = in[1]; - - int b = bits(static_cast(M-m)); - if ((value < m) || (value > M)) {// should be unlikely - uint32_t newm = m; - uint32_t newM = M; - - if (value < m) { - newm = value; - } - if (value > M) { - newM = value; - } - int newb = bits(static_cast(newM - newm)); - if ((newb != b) || (value < m) ) { // unlucky path - vector < uint32_t > buffer(nvalue + 1); - simd_uncompress_length(in, buffer.data(), nvalue); - buffer[nvalue] = value; - uint32_t * newin = simd_compress_length(buffer.data(), nvalue + 1, in); - return (newin - in + 1) * sizeof(uint32_t); - } else {// only max changed - in[1] = newM; - M = newM;// probably unnecessary - } - } - // add one - in += 2; - uint32_t headersize = 3; - - // Appending values one by one defeats the purpose of - // vectorization which is best for in-bulk processing. 
- // So we are not going to waste too much time trying to - // optimize the code here. We just try to avoid - // obviously unnecessary processing. - - uint32_t buffer[128]; - size_t oldhowmanyfull = nvalue / 128; - uint32_t * const newin = in + 4 * b * oldhowmanyfull; - size_t need_decoding = nvalue - oldhowmanyfull * 128; - simdunpackFOR_length(m, (__m128i *)newin, static_cast(need_decoding), buffer, b); - - buffer[need_decoding] = value; - - uint32_t * out = (uint32_t *)simdpackFOR_length(m, buffer, static_cast(need_decoding + 1), (__m128i *) newin,b); - - return ( out - in + headersize ) * sizeof(uint32_t); +size_t SIMDCompressionLib::SIMDFrameOfReference::append( + uint8_t *inbyte, const size_t currentcompressedsizeinbytes, + uint32_t value) { + if (currentcompressedsizeinbytes == 0) { + size_t nvalue = 16; // 16 is arbitrary + encodeArray(&value, 1, (uint32_t *)inbyte, nvalue); + return nvalue * sizeof(uint32_t); + } + uint32_t *in = (uint32_t *)inbyte; + + uint32_t nvalue = *in; + in[0] += 1; // incrementing! + in++; + + uint32_t m = in[0]; + uint32_t M = in[1]; + + int b = bits(static_cast(M - m)); + if ((value < m) || (value > M)) { // should be unlikely + uint32_t newm = m; + uint32_t newM = M; + + if (value < m) { + newm = value; + } + if (value > M) { + newM = value; + } + int newb = bits(static_cast(newM - newm)); + if ((newb != b) || (value < m)) { // unlucky path + vector buffer(nvalue + 1); + simd_uncompress_length(in, buffer.data(), nvalue); + buffer[nvalue] = value; + uint32_t *newin = simd_compress_length(buffer.data(), nvalue + 1, in); + return (newin - in + 1) * sizeof(uint32_t); + } else { // only max changed + in[1] = newM; + M = newM; // probably unnecessary + } + } + // add one + in += 2; + uint32_t headersize = 3; + + // Appending values one by one defeats the purpose of + // vectorization which is best for in-bulk processing. + // So we are not going to waste too much time trying to + // optimize the code here. 
We just try to avoid + // obviously unnecessary processing. + + uint32_t buffer[128]; + size_t oldhowmanyfull = nvalue / 128; + uint32_t *const newin = in + 4 * b * oldhowmanyfull; + size_t need_decoding = nvalue - oldhowmanyfull * 128; + simdunpackFOR_length(m, (__m128i *)newin, static_cast(need_decoding), + buffer, b); + + buffer[need_decoding] = value; + + uint32_t *out = (uint32_t *)simdpackFOR_length( + m, buffer, static_cast(need_decoding + 1), (__m128i *)newin, b); + + return (out - in + headersize) * sizeof(uint32_t); } - /* * Not sure what this was for? * -uint32_t * simd_compress_length_sorted(const uint32_t * in, uint32_t length, uint32_t * out) { +uint32_t * simd_compress_length_sorted(const uint32_t * in, uint32_t length, +uint32_t * out) { if(length == 0) return out; uint32_t m = in[0]; uint32_t M = in[length - 1]; @@ -27231,28 +28038,34 @@ uint32_t * simd_compress_length_sorted(const uint32_t * in, uint32_t length, uin ++out; uint32_t k = 0; for(; k+128<=length; k+=128,in+=128) { - simdpackFOR(m, in, (__m128i *) out, b); - out += b * 4; + simdpackFOR(m, in, (__m128i *) out, b); + out += b * 4; } - if(length != k) out = (uint32_t *) simdpackFOR_length(m, in, length - k , (__m128i *) out,b); + if(length != k) out = (uint32_t *) simdpackFOR_length(m, in, length - k , +(__m128i *) out,b); in += length - k; return out; }*/ -const uint32_t * SIMDCompressionLib::SIMDFrameOfReference::simd_uncompress_length(const uint32_t * in, uint32_t * out, uint32_t nvalue) { - if(nvalue == 0) return out; - uint32_t m = in[0]; - ++in; - uint32_t M = in[0]; - ++in; - int b = bits(static_cast(M-m)); - for(uint32_t k = 0; k(M - m)); + for (uint32_t k = 0; k < nvalue / 128; ++k) { + simdunpackFOR(m, (const __m128i *)(in + 4 * b * k), out + 128 * k, b); + } + out = out + nvalue / 128 * 128; + in = in + nvalue / 128 * 4 * b; + if ((nvalue % 128) != 0) + in = (const uint32_t *)simdunpackFOR_length( + m, (__m128i *)in, nvalue - nvalue / 128 * 128, out, b); + + return in; } diff 
--git a/src/integratedbitpacking.cpp b/src/integratedbitpacking.cpp index 72d8158..553aae8 100644 --- a/src/integratedbitpacking.cpp +++ b/src/integratedbitpacking.cpp @@ -7,6783 +7,6668 @@ #include "integratedbitpacking.h" - namespace SIMDCompressionLib { -void __integratedfastunpack0(const uint32_t initoffset, const uint32_t *__restrict__ , - uint32_t *__restrict__ out) { - for (uint32_t i = 0; i < 32; ++i) - *(out++) = initoffset; -} -void __integratedfastpack0(const uint32_t, const uint32_t *__restrict__ , uint32_t *__restrict__) { +void __integratedfastunpack0(const uint32_t initoffset, + const uint32_t *__restrict__, + uint32_t *__restrict__ out) { + for (uint32_t i = 0; i < 32; ++i) + *(out++) = initoffset; } +void __integratedfastpack0(const uint32_t, const uint32_t *__restrict__, + uint32_t *__restrict__) {} - -void __integratedfastunpack32(const uint32_t, const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (int k = 0 ; k < 32 ; ++k) - out[k] = in[k]; // no sense in wasting time with deltas +void __integratedfastunpack32(const uint32_t, const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; // no sense in wasting time with deltas } -void __integratedfastpack32(const uint32_t, const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { - for (int k = 0 ; k < 32 ; ++k) - out[k] = in[k] ; // no sense in wasting time with deltas +void __integratedfastpack32(const uint32_t, const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; // no sense in wasting time with deltas } - -void __integratedfastunpack2(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 2) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 2) ; - *out += out[-1] 
; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 
16) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) % (1U << 2) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack2(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 2); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 2); + *out += out[-1]; // integrated delta 
decoding + out++; + *out = (*in >> 26) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28) % (1U << 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + *out += out[-1]; // integrated delta decoding } - - - -void __integratedfastunpack3(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 3) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 3) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 1)) << (3 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 2)) << (3 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 3) ; - 
*out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) % (1U << 3) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack3(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 3); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 1)) << (3 - 1); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 2)) << (3 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26) % (1U << 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + *out += out[-1]; // integrated delta decoding } - - - -void __integratedfastunpack5(const 
uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 5) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 3)) << (5 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 1)) << (5 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 4)) << (5 - 4); - *out 
+= out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 2)) << (5 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 5) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack5(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 5); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 3)) << (5 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % 
(1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 1)) << (5 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 4)) << (5 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 2)) << (5 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = 
(*in >> 12) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack6(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 6); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 4)) << (6 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 2)) << (6 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 
0) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 4)) << (6 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 2)) << (6 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack7(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 7); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); 
+ ++in; + *out |= (*in % (1U << 3)) << (7 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 6)) << (7 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 2)) << (7 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 5)) << (7 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 1)) << (7 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U 
<< 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 4)) << (7 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + *out += out[-1]; // integrated delta decoding +} - -void __integratedfastunpack6(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 6) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 4)) << (6 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 2)) << (6 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 6) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 4)) << (6 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 2)) << (6 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 6) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack9(const uint32_t initoffset, + 
const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 9); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 4)) << (9 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 8)) << (9 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 3)) << (9 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 7)) << (9 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 2)) << (9 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 
9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 6)) << (9 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 1)) << (9 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 5)) << (9 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack10(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 10); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 8)) << (10 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % 
(1U << 6)) << (10 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 4)) << (10 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 2)) << (10 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 8)) << (10 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 6)) << (10 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 4)) << (10 - 4); + *out += out[-1]; // integrated delta decoding + out++; + 
*out = (*in >> 4) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 2)) << (10 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack11(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 11); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 1)) << (11 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 2)) << (11 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 3)) << (11 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 4)) << (11 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 11); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 5)) << (11 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 6)) << (11 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 7)) << (11 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 8)) << (11 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 9)) << (11 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 10)) << (11 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + *out += out[-1]; // integrated delta decoding +} +void 
__integratedfastunpack12(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 12); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 4)) << (12 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 8)) << (12 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 4)) << (12 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 8)) << (12 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 4)) << (12 - 4); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 8)) << (12 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 4)) << (12 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 8)) << (12 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + *out += out[-1]; // integrated delta decoding +} -void __integratedfastunpack7(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 7) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 3)) << (7 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 7) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 6)) << (7 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 2)) << (7 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 5)) << (7 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 1)) << (7 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 7) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 4)) << (7 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 7) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack13(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 13); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 7)) << (13 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 1)) << (13 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 8)) << (13 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 2)) << (13 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); 
+ ++in; + *out |= (*in % (1U << 9)) << (13 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 3)) << (13 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 10)) << (13 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 4)) << (13 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 11)) << (13 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 5)) << (13 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 12)) << (13 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 6)) << (13 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 13); + *out += out[-1]; // integrated delta 
decoding + out++; + *out = (*in >> 19); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack14(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 14); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 10)) << (14 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 6)) << (14 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 2)) << (14 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 12)) << (14 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 8)) << (14 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 4)) << (14 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = 
(*in >> 14) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 10)) << (14 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 6)) << (14 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 2)) << (14 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 12)) << (14 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 8)) << (14 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 4)) << (14 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack15(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 15); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 15) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 13)) << (15 - 13); + *out += out[-1]; // integrated delta decoding 
+ out++; + *out = (*in >> 13) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 11)) << (15 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 9)) << (15 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 7)) << (15 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 5)) << (15 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 3)) << (15 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 1)) << (15 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 14)) << (15 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 12)) << (15 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in 
% (1U << 10)) << (15 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 8)) << (15 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 6)) << (15 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 4)) << (15 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 2)) << (15 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack17(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 17); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 2)) << (17 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 4)) << (17 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 6)) << (17 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 17); + *out += out[-1]; // integrated delta decoding + 
out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 8)) << (17 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 10)) << (17 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 12)) << (17 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 14)) << (17 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 16)) << (17 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 1)) << (17 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 3)) << (17 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 5)) << (17 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 7)) << (17 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 9)) << (17 - 9); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 11)) << (17 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 13)) << (17 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13) % (1U << 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 15)) << (17 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + *out += out[-1]; // integrated delta decoding +} -void __integratedfastunpack9(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 9) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 4)) << (9 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 8)) << (9 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 3)) << (9 - 3); - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 7)) << (9 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 2)) << (9 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 6)) << (9 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 1)) << (9 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 5)) << (9 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % 
(1U << 9) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack18(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 18); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 4)) << (18 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 8)) << (18 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 12)) << (18 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 16)) << (18 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 2)) << (18 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 6)) << (18 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 10)) << (18 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 14)) << (18 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 4)) << (18 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 8)) << (18 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 12)) << (18 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 16)) << (18 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 2)) << (18 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 6)) << (18 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 10)) << (18 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 14)) << (18 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack19(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 19); + *out += initoffset; // integrated delta decoding + out++; + 
*out = (*in >> 19); + ++in; + *out |= (*in % (1U << 6)) << (19 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 12)) << (19 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 18)) << (19 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 5)) << (19 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 11)) << (19 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 17)) << (19 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 4)) << (19 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 10)) << (19 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 16)) << (19 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 3)) << (19 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 9)) << (19 - 
9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 15)) << (19 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 2)) << (19 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 8)) << (19 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 14)) << (19 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 1)) << (19 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 7)) << (19 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 13)) << (19 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack20(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 20); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 8)) << (20 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 16)) << (20 - 
16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 4)) << (20 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 12)) << (20 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 8)) << (20 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 16)) << (20 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 4)) << (20 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 12)) << (20 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 8)) << (20 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 16)) << (20 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 4)) << (20 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 20); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 12)) << (20 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 8)) << (20 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 16)) << (20 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 4)) << (20 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 12)) << (20 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack21(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 21); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 10)) << (21 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 20)) << (21 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 9)) << (21 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 19)) << (21 - 19); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 8)) << (21 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 18)) << (21 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 7)) << (21 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 17)) << (21 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 6)) << (21 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 16)) << (21 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 5)) << (21 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 15)) << (21 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 4)) << (21 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 14)) << (21 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 3)) << (21 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; 
+ *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 13)) << (21 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 2)) << (21 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 12)) << (21 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 1)) << (21 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 11)) << (21 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11); + *out += out[-1]; // integrated delta decoding +} -void __integratedfastunpack10(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 10) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 8)) << (10 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 6)) << (10 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 
4)) << (10 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 2)) << (10 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 8)) << (10 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 6)) << (10 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 4)) << (10 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 2)) << (10 - 2); - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 10) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack22(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 22); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 12)) << (22 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 2)) << (22 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 14)) << (22 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 4)) << (22 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 16)) << (22 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 6)) << (22 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 18)) << (22 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 8)) << (22 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 20)) << (22 - 20); + *out += 
out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 10)) << (22 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 12)) << (22 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 2)) << (22 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 14)) << (22 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 4)) << (22 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 16)) << (22 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 6)) << (22 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 18)) << (22 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 8)) << (22 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 20)) << (22 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 10)) << (22 - 10); + *out += out[-1]; // integrated delta 
decoding + out++; + *out = (*in >> 10); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack23(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 23); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 14)) << (23 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 5)) << (23 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 19)) << (23 - 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 10)) << (23 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 1)) << (23 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 15)) << (23 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 6)) << (23 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 20)) << (23 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 11)) << (23 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11); + ++in; + *out |= (*in % (1U << 2)) << (23 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + 
*out |= (*in % (1U << 16)) << (23 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 7)) << (23 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 21)) << (23 - 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 12)) << (23 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 3)) << (23 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 17)) << (23 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 8)) << (23 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 22)) << (23 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 13)) << (23 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 4)) << (23 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 18)) << (23 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 9)) << (23 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack24(const uint32_t 
initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 24); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += 
out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 16)) << (24 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 8)) << (24 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack25(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 25); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 18)) << (25 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 11)) << (25 - 11); + *out += out[-1]; // 
integrated delta decoding + out++; + *out = (*in >> 11); + ++in; + *out |= (*in % (1U << 4)) << (25 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 22)) << (25 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 15)) << (25 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 8)) << (25 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 1)) << (25 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 19)) << (25 - 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 12)) << (25 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 5)) << (25 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 23)) << (25 - 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 16)) << (25 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 9)) << (25 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9); + ++in; + *out |= (*in % (1U << 2)) << (25 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 
20)) << (25 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 13)) << (25 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 6)) << (25 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 24)) << (25 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 17)) << (25 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 10)) << (25 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 3)) << (25 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 21)) << (25 - 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 14)) << (25 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 7)) << (25 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7); + *out += out[-1]; // integrated delta decoding +} -void __integratedfastunpack11(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 11) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 1)) << (11 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 11) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 2)) << (11 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 3)) << (11 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 4)) << (11 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 5)) << (11 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 6)) << (11 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 7)) << (11 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 11) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 8)) << (11 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 9)) << (11 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 10)) << (11 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 11) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack26(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 26); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 20)) << (26 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 14)) << (26 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 8)) << (26 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 2)) << (26 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 22)) << (26 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 16)) << (26 - 16); + 
*out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 10)) << (26 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 4)) << (26 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 24)) << (26 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 18)) << (26 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 12)) << (26 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 6)) << (26 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 20)) << (26 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 14)) << (26 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 8)) << (26 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 2)) << (26 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 22)) << (26 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 16)) << (26 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + 
*out |= (*in % (1U << 10)) << (26 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 4)) << (26 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 24)) << (26 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 18)) << (26 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 12)) << (26 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 6)) << (26 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack27(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 27); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 22)) << (27 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 17)) << (27 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 12)) << (27 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 7)) << (27 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7); + ++in; + *out |= (*in % (1U << 2)) << (27 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 24)) << (27 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = 
(*in >> 24); + ++in; + *out |= (*in % (1U << 19)) << (27 - 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 14)) << (27 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 9)) << (27 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9); + ++in; + *out |= (*in % (1U << 4)) << (27 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4) % (1U << 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 26)) << (27 - 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 21)) << (27 - 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 16)) << (27 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 11)) << (27 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11); + ++in; + *out |= (*in % (1U << 6)) << (27 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out |= (*in % (1U << 1)) << (27 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 23)) << (27 - 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 18)) << (27 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 13)) << (27 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 8)) << (27 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 
8); + ++in; + *out |= (*in % (1U << 3)) << (27 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3) % (1U << 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 25)) << (27 - 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 20)) << (27 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 15)) << (27 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 10)) << (27 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 5)) << (27 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack28(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 28); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 24)) << (28 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 20)) << (28 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 16)) << (28 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 12)) << (28 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 8)) << (28 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 4)) << (28 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out += out[-1]; // integrated delta decoding + 
out++; + *out = (*in >> 0) % (1U << 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 24)) << (28 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 20)) << (28 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 16)) << (28 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 12)) << (28 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 8)) << (28 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 4)) << (28 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 24)) << (28 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 20)) << (28 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 16)) << (28 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 12)) << (28 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 8)) << (28 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 4)) << (28 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 28); + *out += out[-1]; // integrated delta decoding + 
out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 24)) << (28 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 20)) << (28 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 16)) << (28 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 12)) << (28 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 8)) << (28 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 4)) << (28 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack29(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 29); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 26)) << (29 - 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 23)) << (29 - 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 20)) << (29 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 17)) << (29 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 14)) << (29 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 11)) << (29 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11); + ++in; + *out |= (*in % (1U << 8)) << (29 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 
8); + ++in; + *out |= (*in % (1U << 5)) << (29 - 5); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 5); + ++in; + *out |= (*in % (1U << 2)) << (29 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2) % (1U << 29); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 28)) << (29 - 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 25)) << (29 - 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 22)) << (29 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 19)) << (29 - 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 16)) << (29 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 13)) << (29 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 10)) << (29 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 7)) << (29 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7); + ++in; + *out |= (*in % (1U << 4)) << (29 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out |= (*in % (1U << 1)) << (29 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1) % (1U << 29); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 27)) << (29 - 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 24)) << (29 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + 
++in; + *out |= (*in % (1U << 21)) << (29 - 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 18)) << (29 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 15)) << (29 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 12)) << (29 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 9)) << (29 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9); + ++in; + *out |= (*in % (1U << 6)) << (29 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out |= (*in % (1U << 3)) << (29 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3); + *out += out[-1]; // integrated delta decoding +} -void __integratedfastunpack12(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 12) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 4)) << (12 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 8)) << (12 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in 
>> 12) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 4)) << (12 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 8)) << (12 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 4)) << (12 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 8)) << (12 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 4)) << (12 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 12) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 8)) << (12 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 12) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack30(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 30); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 28)) << (30 - 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 26)) << (30 - 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 24)) << (30 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 22)) << (30 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 20)) << (30 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 18)) << (30 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 16)) << (30 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 14)) << (30 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 12)) << (30 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 10)) << (30 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 8)) << (30 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + 
++in; + *out |= (*in % (1U << 6)) << (30 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out |= (*in % (1U << 4)) << (30 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out |= (*in % (1U << 2)) << (30 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2); + ++in; + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 0) % (1U << 30); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 28)) << (30 - 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 26)) << (30 - 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 24)) << (30 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 22)) << (30 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 20)) << (30 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 18)) << (30 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 16)) << (30 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 14)) << (30 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 12)) << (30 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 10)) << (30 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 8)) << (30 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out 
|= (*in % (1U << 6)) << (30 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out |= (*in % (1U << 4)) << (30 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out |= (*in % (1U << 2)) << (30 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2); + *out += out[-1]; // integrated delta decoding } +void __integratedfastunpack31(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in >> 0) % (1U << 31); + *out += initoffset; // integrated delta decoding + out++; + *out = (*in >> 31); + ++in; + *out |= (*in % (1U << 30)) << (31 - 30); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 30); + ++in; + *out |= (*in % (1U << 29)) << (31 - 29); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 29); + ++in; + *out |= (*in % (1U << 28)) << (31 - 28); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 28); + ++in; + *out |= (*in % (1U << 27)) << (31 - 27); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 27); + ++in; + *out |= (*in % (1U << 26)) << (31 - 26); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 26); + ++in; + *out |= (*in % (1U << 25)) << (31 - 25); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 25); + ++in; + *out |= (*in % (1U << 24)) << (31 - 24); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 24); + ++in; + *out |= (*in % (1U << 23)) << (31 - 23); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 23); + ++in; + *out |= (*in % (1U << 22)) << (31 - 22); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 22); + ++in; + *out |= (*in % (1U << 21)) << (31 - 21); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 21); + ++in; + *out |= (*in % (1U << 20)) << 
(31 - 20); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 20); + ++in; + *out |= (*in % (1U << 19)) << (31 - 19); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 19); + ++in; + *out |= (*in % (1U << 18)) << (31 - 18); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 18); + ++in; + *out |= (*in % (1U << 17)) << (31 - 17); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 17); + ++in; + *out |= (*in % (1U << 16)) << (31 - 16); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 16); + ++in; + *out |= (*in % (1U << 15)) << (31 - 15); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 15); + ++in; + *out |= (*in % (1U << 14)) << (31 - 14); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 14); + ++in; + *out |= (*in % (1U << 13)) << (31 - 13); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 13); + ++in; + *out |= (*in % (1U << 12)) << (31 - 12); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 12); + ++in; + *out |= (*in % (1U << 11)) << (31 - 11); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 11); + ++in; + *out |= (*in % (1U << 10)) << (31 - 10); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 10); + ++in; + *out |= (*in % (1U << 9)) << (31 - 9); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 9); + ++in; + *out |= (*in % (1U << 8)) << (31 - 8); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 8); + ++in; + *out |= (*in % (1U << 7)) << (31 - 7); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 7); + ++in; + *out |= (*in % (1U << 6)) << (31 - 6); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 6); + ++in; + *out |= (*in % (1U << 5)) << (31 - 5); + *out += out[-1]; // integrated delta decoding + 
out++; + *out = (*in >> 5); + ++in; + *out |= (*in % (1U << 4)) << (31 - 4); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 4); + ++in; + *out |= (*in % (1U << 3)) << (31 - 3); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 3); + ++in; + *out |= (*in % (1U << 2)) << (31 - 2); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 2); + ++in; + *out |= (*in % (1U << 1)) << (31 - 1); + *out += out[-1]; // integrated delta decoding + out++; + *out = (*in >> 1); + *out += out[-1]; // integrated delta decoding +} +void __integratedfastunpack1(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in & 1) + initoffset; + ++out; + *out = ((*in >> 1) & 1) + out[-1]; + ++out; + for (uint32_t i = 2; i < 32; i += 1) { + *out = ((*in >> i) & 1) + out[-1]; + ++i; + ++out; + *out = ((*in >> i) & 1) + out[-1]; + ++out; + } +} +void __integratedfastunpack4(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *(out++) = (*in % (1U << 4)) + initoffset; + for (uint32_t i = 4; i < 32; i += 4) { + *out = ((*in >> i) % (1U << 4)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 4) { + *out = ((*in >> i) % (1U << 4)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 4) { + *out = ((*in >> i) % (1U << 4)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 4) { + *out = ((*in >> i) % (1U << 4)) + out[-1]; + ++out; + } +} -void __integratedfastunpack13(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 13) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 7)) << (13 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 
7) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 1)) << (13 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 8)) << (13 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 2)) << (13 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 9)) << (13 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 3)) << (13 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 10)) << (13 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 4)) << (13 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) % (1U << 13) ; - *out += out[-1] ; // integrated delta 
decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 11)) << (13 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 5)) << (13 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 12)) << (13 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 6)) << (13 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 13) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastunpack8(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *(out++) = (*in % (1U << 8)) + initoffset; + for (uint32_t i = 8; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } + ++in; 
+ for (uint32_t i = 0; i < 32; i += 8) { + *out = ((*in >> i) % (1U << 8)) + out[-1]; + ++out; + } } +void __integratedfastunpack16(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *(out++) = (*in % (1U << 16)) + initoffset; + for (uint32_t i = 16; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + ++out; + } + ++in; + for (uint32_t i = 0; i < 32; i += 16) { + *out = ((*in >> i) % (1U << 16)) + out[-1]; + 
++out; + } +} +void __integratedfastpack2(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 20; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 22; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 24; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 26; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 28; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 20; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 22; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 24; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 26; + ++in; + *out |= ((*in - in[-1]) % (1U << 2)) << 28; + ++in; + *out |= ((*in - in[-1])) << 30; +} +void __integratedfastpack3(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 3); + ++in; + *out 
|= ((*in - in[-1]) % (1U << 3)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 9; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 15; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 21; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 24; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 27; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 3)) >> (3 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 7; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 13; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 19; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 22; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 25; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 28; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 3)) >> (3 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 5; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 11; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 17; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 20; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 23; + ++in; + *out |= ((*in - in[-1]) % (1U << 3)) << 26; + ++in; + *out |= ((*in - in[-1])) << 29; +} -void __integratedfastunpack14(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 14) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 14) ; - *out += 
out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 10)) << (14 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 6)) << (14 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 2)) << (14 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 12)) << (14 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 8)) << (14 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 4)) << (14 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 10)) << (14 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = 
(*in >> 24) ; - ++in; - *out |= (*in % (1U << 6)) << (14 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 2)) << (14 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 12)) << (14 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 8)) << (14 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 4)) << (14 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 14) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack5(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 5; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 15; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 20; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 25; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 5)) >> (5 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 13; + ++in; + *out |= ((*in - 
in[-1]) % (1U << 5)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 23; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 5)) >> (5 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 11; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 21; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 26; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 5)) >> (5 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 9; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 19; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 24; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 5)) >> (5 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 7; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 17; + ++in; + *out |= ((*in - in[-1]) % (1U << 5)) << 22; + ++in; + *out |= ((*in - in[-1])) << 27; } +void __integratedfastpack6(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 24; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 6)) >> (6 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 22; + ++in; + *out |= 
((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 6)) >> (6 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 20; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 18; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 24; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 6)) >> (6 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 22; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 6)) >> (6 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 6)) << 20; + ++in; + *out |= ((*in - in[-1])) << 26; +} +void __integratedfastpack7(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 7; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 14; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 21; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 17; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 24; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) 
% (1U << 7)) >> (7 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 13; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 20; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 9; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 16; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 23; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 5; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 19; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 15; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 22; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 11; + ++in; + *out |= ((*in - in[-1]) % (1U << 7)) << 18; + ++in; + *out |= ((*in - in[-1])) << 25; +} +void __integratedfastpack9(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 9; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 18; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 13; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 22; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U 
<< 9)) >> (9 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 17; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 12; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 21; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 7; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 16; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 11; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 20; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 15; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 19; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 5; + ++in; + *out |= ((*in - in[-1]) % (1U << 9)) << 14; + ++in; + *out |= ((*in - in[-1])) << 23; +} -void __integratedfastunpack15(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 15) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 15) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 13)) << (15 - 13); - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 11)) << (15 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 9)) << (15 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 7)) << (15 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 5)) << (15 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 3)) << (15 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 1)) << (15 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 14)) << (15 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 12)) << (15 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 15) ; - *out += out[-1] ; // integrated delta 
decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 10)) << (15 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 8)) << (15 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 6)) << (15 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 4)) << (15 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 2)) << (15 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 15) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack10(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 20; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 18; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 16; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> 
(10 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 14; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 12; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 10; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 20; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 18; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 16; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 14; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 10)) << 12; + ++in; + *out |= ((*in - in[-1])) << 22; } +void __integratedfastpack11(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 11); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 11; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 12; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 2); + ++in; + *out |= ((*in - in[-1]) 
% (1U << 11)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 13; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 14; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 15; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 5; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 16; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 6; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 17; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 7; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 18; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 8; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 19; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 9; + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 20; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 11)) << 10; + ++in; + *out |= ((*in - in[-1])) << 21; +} +void __integratedfastpack12(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 12; + ++in; + *out 
|= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 16; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 8; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 12; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 16; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 8; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 12; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 16; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 8; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 12; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 16; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 12)) << 8; + ++in; + *out |= ((*in - in[-1])) << 20; +} +void __integratedfastpack13(const 
uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 13); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 13; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 7; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 14; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 8; + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 15; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 9; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 3; + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 16; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 10; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 4; + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 17; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 11); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 11; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 5; + ++in; + *out |= 
((*in - in[-1]) % (1U << 13)) << 18; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 12; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 13)) << 6; + ++in; + *out |= ((*in - in[-1])) << 19; +} -void __integratedfastunpack17(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 17) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - ++in; - *out |= (*in % (1U << 2)) << (17 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 4)) << (17 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 6)) << (17 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 8)) << (17 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 10)) << (17 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 12)) << (17 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out 
|= (*in % (1U << 14)) << (17 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 16)) << (17 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 1)) << (17 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 3)) << (17 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 5)) << (17 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 7)) << (17 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 9)) << (17 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 11)) << (17 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 13)) << (17 - 13); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) % (1U << 17) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 15)) << (17 - 15); - *out += out[-1] ; // integrated 
delta decoding - out++; - *out = (*in >> 15) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack14(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 14); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 14; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 10; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 6; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 16; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 12; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 8; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 4; + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 14); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 14; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 10; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 6; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 2; + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 16; + 
++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 12; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 8; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 14)) << 4; + ++in; + *out |= ((*in - in[-1])) << 18; } +void __integratedfastpack15(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 15); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 15; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 13); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 13; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 11); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 11; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 9; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 7; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 5; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 3; + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 1; + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 16; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 14); + 
++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 14; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 12; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 10; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 8; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 6; + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 4; + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 15)) << 2; + ++in; + *out |= ((*in - in[-1])) << 17; +} +void __integratedfastpack17(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 2; + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 4; + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 6; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 8; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 10; + 
++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 12; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 14); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 14; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 1; + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 3; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 5; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 7; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 9; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 11); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 11; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 13); + ++in; + *out |= ((*in - in[-1]) % (1U << 17)) << 13; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; +} +void __integratedfastpack18(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 4; + 
++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 8; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 12; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 2; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 6; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 10; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 4; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 8; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 12; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 2; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 6; + ++in; + *out 
|= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 18)) << 10; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; +} -void __integratedfastunpack18(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 18) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 4)) << (18 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 8)) << (18 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 12)) << (18 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 16)) << (18 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 2)) << (18 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 6)) << (18 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 10)) << (18 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = 
(*in >> 28) ; - ++in; - *out |= (*in % (1U << 14)) << (18 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 4)) << (18 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 8)) << (18 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 12)) << (18 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 16)) << (18 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 2)) << (18 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 6)) << (18 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 10)) << (18 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 18) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 14)) << (18 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - *out += out[-1] ; // integrated delta decoding +void 
__integratedfastpack19(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 6; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 12); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 12; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 5; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 11); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 11; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 4; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 10; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 3; + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 9; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 2; + 
++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 8; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 1; + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 19)) << 7; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; } +void __integratedfastpack20(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 8; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 4; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 8; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 4; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) 
% (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 8; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 4; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 8; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 20)) << 4; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; +} +void __integratedfastpack21(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 10); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 10; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 9); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 9; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 19); 
+ ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 8; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 7; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 6; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 5; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 4; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 3; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 2; + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 21)) << 1; + ++in; + *out 
|= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 11); + ++in; + *out |= ((*in - in[-1])) << 11; +} +void __integratedfastpack22(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 2; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 4; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 6; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 8; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 2; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) 
>> (22 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 4; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 6; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 22)) << 8; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; +} -void __integratedfastunpack19(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 19) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 6)) << (19 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 12)) << (19 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 18)) << (19 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 5)) << (19 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; 
- *out |= (*in % (1U << 11)) << (19 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 17)) << (19 - 17); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - ++in; - *out |= (*in % (1U << 4)) << (19 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 10)) << (19 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 16)) << (19 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 3)) << (19 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 9)) << (19 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 15)) << (19 - 15); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) ; - ++in; - *out |= (*in % (1U << 2)) << (19 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 8)) << (19 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 14)) << (19 
- 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 1)) << (19 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 7)) << (19 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 19) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 13)) << (19 - 13); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack23(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 23); + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 5; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 1; + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 6; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 11); + ++in; + *out |= ((*in 
- in[-1])) << 11; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 2; + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 7); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 7; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 3; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 8); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 8; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 23)) << 4; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 9); + ++in; + *out |= ((*in - in[-1])) << 9; } +void __integratedfastpack24(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> 
(24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) 
% (1U << 24)) >> (24 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; +} +void __integratedfastpack25(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 25); + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 11); + ++in; + *out |= ((*in - in[-1])) << 11; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) << 4; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) << 1; + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 5); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) << 5; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 23); + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 9); + ++in; + *out |= ((*in - in[-1])) << 9; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) 
<< 2; + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 6); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) << 6; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 25)) << 3; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 7); + ++in; + *out |= ((*in - in[-1])) << 7; +} +void __integratedfastpack26(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 26)) << 2; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - 
in[-1]) % (1U << 26)) >> (26 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 26)) << 4; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 26)) << 2; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 26)) << 4; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + 
++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; +} -void __integratedfastunpack20(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 20) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 8)) << (20 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 16)) << (20 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 4)) << (20 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 12)) << (20 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 8)) << (20 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 16)) << (20 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 4)) << (20 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 12)) << (20 - 12); - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 8)) << (20 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 16)) << (20 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 4)) << (20 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 12)) << (20 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 8)) << (20 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 16)) << (20 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 4)) << (20 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 20) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 12)) << (20 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack27(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t 
*__restrict__ out) { + *out = (*in - initoffset) % (1U << 27); + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 7); + ++in; + *out |= ((*in - in[-1])) << 7; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 27)) << 2; + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 9); + ++in; + *out |= ((*in - in[-1])) << 9; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 4); + ++in; + *out |= ((*in - in[-1]) % (1U << 27)) << 4; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 11); + ++in; + *out |= ((*in - in[-1])) << 11; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 27)) << 1; + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 23); + ++in; + *out 
|= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 3); + ++in; + *out |= ((*in - in[-1]) % (1U << 27)) << 3; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 25); + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 5); + ++in; + *out |= ((*in - in[-1])) << 5; } +void __integratedfastpack28(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); 
+ ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; +} 
+void __integratedfastpack29(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 29); + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 23); + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 11); + ++in; + *out |= ((*in - in[-1])) << 11; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 5); + ++in; + *out |= ((*in - in[-1])) << 5; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 2); + ++in; + *out |= ((*in - in[-1]) % (1U << 29)) << 2; + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 25); + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 7); + ++in; + *out |= 
((*in - in[-1])) << 7; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 1); + ++in; + *out |= ((*in - in[-1]) % (1U << 29)) << 1; + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 27); + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 9); + ++in; + *out |= ((*in - in[-1])) << 9; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 3); + ++in; + *out |= ((*in - in[-1])) << 3; +} +void __integratedfastpack30(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 30); + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 
18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 2); + ++in; + *out |= ((*in - in[-1])) << 2; + ++out; + ++in; + *out = (*in - in[-1]) % (1U << 30); + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 16); + ++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 10); + ++in; + *out |= ((*in - 
in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 2); + ++in; + *out |= ((*in - in[-1])) << 2; +} -void __integratedfastunpack21(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 21) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 10)) << (21 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 20)) << (21 - 20); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 9)) << (21 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 19)) << (21 - 19); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 8)) << (21 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 18)) << (21 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 7)) << (21 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 17)) << 
(21 - 17); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - ++in; - *out |= (*in % (1U << 6)) << (21 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 16)) << (21 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 5)) << (21 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 15)) << (21 - 15); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) ; - ++in; - *out |= (*in % (1U << 4)) << (21 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 14)) << (21 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 3)) << (21 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 13)) << (21 - 13); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) ; - ++in; - *out |= (*in % (1U << 2)) << (21 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 12)) << (21 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out |= (*in % (1U << 1)) << (21 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U 
<< 21) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 11)) << (21 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) ; - *out += out[-1] ; // integrated delta decoding +void __integratedfastpack31(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = (*in - initoffset) % (1U << 31); + ++in; + *out |= ((*in - in[-1])) << 31; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 30); + ++in; + *out |= ((*in - in[-1])) << 30; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 29); + ++in; + *out |= ((*in - in[-1])) << 29; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 28); + ++in; + *out |= ((*in - in[-1])) << 28; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 27); + ++in; + *out |= ((*in - in[-1])) << 27; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 26); + ++in; + *out |= ((*in - in[-1])) << 26; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 25); + ++in; + *out |= ((*in - in[-1])) << 25; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 24); + ++in; + *out |= ((*in - in[-1])) << 24; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 23); + ++in; + *out |= ((*in - in[-1])) << 23; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 22); + ++in; + *out |= ((*in - in[-1])) << 22; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 21); + ++in; + *out |= ((*in - in[-1])) << 21; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 20); + ++in; + *out |= ((*in - in[-1])) << 20; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 19); + ++in; + *out |= ((*in - in[-1])) << 19; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 18); + ++in; + *out |= ((*in - in[-1])) << 18; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 17); + ++in; + *out |= ((*in - in[-1])) << 17; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 16); + 
++in; + *out |= ((*in - in[-1])) << 16; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 15); + ++in; + *out |= ((*in - in[-1])) << 15; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 14); + ++in; + *out |= ((*in - in[-1])) << 14; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 13); + ++in; + *out |= ((*in - in[-1])) << 13; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 12); + ++in; + *out |= ((*in - in[-1])) << 12; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 11); + ++in; + *out |= ((*in - in[-1])) << 11; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 10); + ++in; + *out |= ((*in - in[-1])) << 10; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 9); + ++in; + *out |= ((*in - in[-1])) << 9; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 8); + ++in; + *out |= ((*in - in[-1])) << 8; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 7); + ++in; + *out |= ((*in - in[-1])) << 7; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 6); + ++in; + *out |= ((*in - in[-1])) << 6; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 5); + ++in; + *out |= ((*in - in[-1])) << 5; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 4); + ++in; + *out |= ((*in - in[-1])) << 4; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 3); + ++in; + *out |= ((*in - in[-1])) << 3; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 2); + ++in; + *out |= ((*in - in[-1])) << 2; + ++out; + *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 1); + ++in; + *out |= ((*in - in[-1])) << 1; } +/*assumes that integers fit in the prescribed number of bits*/ +void __integratedfastpack1(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = *(in++) - initoffset; + for (uint32_t i = 1; i < 32; i += 1) { + *out |= (*in - in[-1]) << i; + ++in; + } +} +/*assumes that integers fit in the prescribed number of bits*/ +void __integratedfastpack4(const uint32_t 
initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = *(in++) - initoffset; + for (uint32_t i = 4; i < 32; i += 4) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 4; i < 32; i += 4) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 4; i < 32; i += 4) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 4; i < 32; i += 4) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; +} - -void __integratedfastunpack22(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 22) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 12)) << (22 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out |= (*in % (1U << 2)) << (22 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 14)) << (22 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 4)) << (22 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 16)) << (22 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 6)) << (22 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 18)) << (22 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - 
*out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 8)) << (22 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 20)) << (22 - 20); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 10)) << (22 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 12)) << (22 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out |= (*in % (1U << 2)) << (22 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 14)) << (22 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 4)) << (22 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 16)) << (22 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 6)) << (22 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 18)) << (22 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 8)) << (22 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - 
*out = (*in >> 8) % (1U << 22) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 20)) << (22 - 20); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 10)) << (22 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) ; - *out += out[-1] ; // integrated delta decoding -} - - - - -void __integratedfastunpack23(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 23) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 23) ; - ++in; - *out |= (*in % (1U << 14)) << (23 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 5)) << (23 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 19)) << (23 - 19); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 10)) << (23 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) ; - ++in; - *out |= (*in % (1U << 1)) << (23 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 15)) << (23 - 15); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) ; - ++in; - *out |= (*in % (1U << 6)) << (23 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 20)) << (23 - 20); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in 
>> 20) ; - ++in; - *out |= (*in % (1U << 11)) << (23 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) ; - ++in; - *out |= (*in % (1U << 2)) << (23 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 16)) << (23 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 7)) << (23 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 21)) << (23 - 21); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 12)) << (23 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out |= (*in % (1U << 3)) << (23 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 17)) << (23 - 17); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - ++in; - *out |= (*in % (1U << 8)) << (23 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) % (1U << 23) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 22)) << (23 - 22); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 13)) << (23 - 13); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) ; - ++in; - *out |= (*in % (1U << 4)) << (23 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 23) ; - *out += out[-1] ; // 
integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 18)) << (23 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 9)) << (23 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) ; - *out += out[-1] ; // integrated delta decoding -} - - - - -void __integratedfastunpack24(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 24) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 
16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 0) % (1U << 24) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 16)) << (24 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 8)) << (24 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - *out += 
out[-1] ; // integrated delta decoding -} - - - - -void __integratedfastunpack25(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in >> 0) % (1U << 25) ; - *out += initoffset ; // integrated delta decoding - out++; - *out = (*in >> 25) ; - ++in; - *out |= (*in % (1U << 18)) << (25 - 18); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 18) ; - ++in; - *out |= (*in % (1U << 11)) << (25 - 11); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 11) ; - ++in; - *out |= (*in % (1U << 4)) << (25 - 4); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 4) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 29) ; - ++in; - *out |= (*in % (1U << 22)) << (25 - 22); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 22) ; - ++in; - *out |= (*in % (1U << 15)) << (25 - 15); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 15) ; - ++in; - *out |= (*in % (1U << 8)) << (25 - 8); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 8) ; - ++in; - *out |= (*in % (1U << 1)) << (25 - 1); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 1) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 26) ; - ++in; - *out |= (*in % (1U << 19)) << (25 - 19); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 19) ; - ++in; - *out |= (*in % (1U << 12)) << (25 - 12); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 12) ; - ++in; - *out |= (*in % (1U << 5)) << (25 - 5); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 5) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 30) ; - ++in; - *out |= (*in % (1U << 23)) << (25 - 23); - *out += out[-1] ; // integrated delta decoding - out++; - *out = 
(*in >> 23) ; - ++in; - *out |= (*in % (1U << 16)) << (25 - 16); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 16) ; - ++in; - *out |= (*in % (1U << 9)) << (25 - 9); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 9) ; - ++in; - *out |= (*in % (1U << 2)) << (25 - 2); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 2) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 27) ; - ++in; - *out |= (*in % (1U << 20)) << (25 - 20); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 20) ; - ++in; - *out |= (*in % (1U << 13)) << (25 - 13); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 13) ; - ++in; - *out |= (*in % (1U << 6)) << (25 - 6); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 6) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 31) ; - ++in; - *out |= (*in % (1U << 24)) << (25 - 24); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 24) ; - ++in; - *out |= (*in % (1U << 17)) << (25 - 17); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 17) ; - ++in; - *out |= (*in % (1U << 10)) << (25 - 10); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 10) ; - ++in; - *out |= (*in % (1U << 3)) << (25 - 3); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 3) % (1U << 25) ; - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 28) ; - ++in; - *out |= (*in % (1U << 21)) << (25 - 21); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 21) ; - ++in; - *out |= (*in % (1U << 14)) << (25 - 14); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 14) ; - ++in; - *out |= (*in % (1U << 7)) << (25 - 7); - *out += out[-1] ; // integrated delta decoding - out++; - *out = (*in >> 7) ; - 
// Integrated delta (prefix-sum) decoder for 26-bit packed values.
//
// Reads 32 gaps of 26 bits each from `in` (26 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`. Fields may straddle a 32-bit word boundary.
void __integratedfastunpack26(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 26;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // splice in the remaining high bits
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (prefix-sum) decoder for 27-bit packed values.
//
// Reads 32 gaps of 27 bits each from `in` (27 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`.
void __integratedfastunpack27(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 27;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // remaining high bits of the field
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (prefix-sum) decoder for 28-bit packed values.
//
// Reads 32 gaps of 28 bits each from `in` (28 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`. Since 28 divides into 32-bit words every 8 values, some
// fields end exactly on a word boundary.
void __integratedfastunpack28(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 28;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // remaining high bits of the field
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (prefix-sum) decoder for 29-bit packed values.
//
// Reads 32 gaps of 29 bits each from `in` (29 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`.
void __integratedfastunpack29(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 29;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // remaining high bits of the field
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (prefix-sum) decoder for 30-bit packed values.
//
// Reads 32 gaps of 30 bits each from `in` (30 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`.
void __integratedfastunpack30(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 30;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // remaining high bits of the field
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (prefix-sum) decoder for 31-bit packed values.
//
// Reads 32 gaps of 31 bits each from `in` (31 consecutive 32-bit words),
// accumulates them on top of `initoffset`, and writes the 32 reconstructed
// values to `out`. Every field except the first and last straddles a word
// boundary.
void __integratedfastunpack31(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 31;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // remaining high bits of the field
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
namespace {
// Shared scalar loop: decode 32 fields of `Width` bits from `in` with
// integrated (prefix-sum) delta decoding, starting the running sum at `base`.
// Word-aligned widths (1, 4, 8) never straddle a word boundary, so the
// straddle branch is compiled away where possible.
template <uint32_t Width>
inline void scalar_integrated_unpack(uint32_t base,
                                     const uint32_t *__restrict__ in,
                                     uint32_t *__restrict__ out) {
  const uint32_t mask = (1U << Width) - 1;
  uint32_t running = base; // prefix sum carried across values
  uint32_t shift = 0;      // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += Width;
    if (shift >= 32) { // field ends in (or exactly at) the next word
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (Width - shift);
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
} // namespace

// 1-bit gaps: all 32 fields live in a single input word.
void __integratedfastunpack1(const uint32_t initoffset,
                             const uint32_t *__restrict__ in,
                             uint32_t *__restrict__ out) {
  scalar_integrated_unpack<1>(initoffset, in, out);
}

// 4-bit gaps: eight fields per input word, 4 words total.
void __integratedfastunpack4(const uint32_t initoffset,
                             const uint32_t *__restrict__ in,
                             uint32_t *__restrict__ out) {
  scalar_integrated_unpack<4>(initoffset, in, out);
}

// 8-bit gaps: four fields per input word, 8 words total.
void __integratedfastunpack8(const uint32_t initoffset,
                             const uint32_t *__restrict__ in,
                             uint32_t *__restrict__ out) {
  scalar_integrated_unpack<8>(initoffset, in, out);
}
// Integrated delta (prefix-sum) decoder for 16-bit packed values.
//
// Reads 32 gaps of 16 bits each from `in` (16 consecutive 32-bit words,
// two gaps per word, no field ever straddles a word boundary), accumulates
// them on top of `initoffset`, and writes the 32 reconstructed values to
// `out`.
void __integratedfastunpack16(const uint32_t initoffset,
                              const uint32_t *__restrict__ in,
                              uint32_t *__restrict__ out) {
  const uint32_t width = 16;
  const uint32_t mask = (1U << width) - 1;
  uint32_t running = initoffset; // prefix sum carried across values
  uint32_t shift = 0;            // bit offset of the next field within *in
  for (int k = 0; k < 32; ++k) {
    uint32_t field = *in >> shift;
    shift += width;
    if (shift >= 32) { // 16-bit fields always end exactly on a word boundary
      ++in;
      shift -= 32;
      if (shift != 0)
        field |= *in << (width - shift); // unreachable for width 16
    }
    running += field & mask; // integrated delta decoding
    *out++ = running;
  }
}
// Integrated delta (gap) encoder for 2-bit packed values.
//
// Computes the 32 successive gaps in[k] - in[k-1] (in[0] - initoffset for
// the first), keeps the low 2 bits of each, and tightly packs them into
// 2 consecutive 32-bit output words (16 gaps per word).
void __integratedfastpack2(const uint32_t initoffset,
                           const uint32_t *__restrict__ in,
                           uint32_t *__restrict__ out) {
  const uint32_t width = 2;
  const uint32_t mask = (1U << width) - 1;
  uint32_t previous = initoffset; // last raw input value seen
  uint32_t word = 0;              // output word being assembled
  uint32_t used = 0;              // bits of `word` already filled
  for (int k = 0; k < 32; ++k) {
    const uint32_t gap = (in[k] - previous) & mask; // delta, modulo 2^2
    previous = in[k];
    word |= gap << used;
    used += width;
    if (used >= 32) { // word full: flush it, carry over any spilled bits
      *out++ = word;
      used -= 32;
      word = (used != 0) ? (gap >> (width - used)) : 0;
    }
  }
}
// Integrated delta (gap) encoder for 3-bit packed values.
//
// Computes the 32 successive gaps in[k] - in[k-1] (in[0] - initoffset for
// the first), keeps the low 3 bits of each, and tightly packs them into
// 3 consecutive 32-bit output words; gaps may straddle word boundaries.
void __integratedfastpack3(const uint32_t initoffset,
                           const uint32_t *__restrict__ in,
                           uint32_t *__restrict__ out) {
  const uint32_t width = 3;
  const uint32_t mask = (1U << width) - 1;
  uint32_t previous = initoffset; // last raw input value seen
  uint32_t word = 0;              // output word being assembled
  uint32_t used = 0;              // bits of `word` already filled
  for (int k = 0; k < 32; ++k) {
    const uint32_t gap = (in[k] - previous) & mask; // delta, modulo 2^3
    previous = in[k];
    word |= gap << used;
    used += width;
    if (used >= 32) { // word full: flush it, carry over any spilled bits
      *out++ = word;
      used -= 32;
      word = (used != 0) ? (gap >> (width - used)) : 0;
    }
  }
}
// Integrated delta (gap) encoder for 5-bit packed values.
//
// Computes the 32 successive gaps in[k] - in[k-1] (in[0] - initoffset for
// the first), keeps the low 5 bits of each, and tightly packs them into
// 5 consecutive 32-bit output words; gaps may straddle word boundaries.
void __integratedfastpack5(const uint32_t initoffset,
                           const uint32_t *__restrict__ in,
                           uint32_t *__restrict__ out) {
  const uint32_t width = 5;
  const uint32_t mask = (1U << width) - 1;
  uint32_t previous = initoffset; // last raw input value seen
  uint32_t word = 0;              // output word being assembled
  uint32_t used = 0;              // bits of `word` already filled
  for (int k = 0; k < 32; ++k) {
    const uint32_t gap = (in[k] - previous) & mask; // delta, modulo 2^5
    previous = in[k];
    word |= gap << used;
    used += width;
    if (used >= 32) { // word full: flush it, carry over any spilled bits
      *out++ = word;
      used -= 32;
      word = (used != 0) ? (gap >> (width - used)) : 0;
    }
  }
}
// Integrated delta (gap) encoder for 6-bit packed values.
//
// Computes the 32 successive gaps in[k] - in[k-1] (in[0] - initoffset for
// the first), keeps the low 6 bits of each, and tightly packs them into
// 6 consecutive 32-bit output words; gaps may straddle word boundaries.
void __integratedfastpack6(const uint32_t initoffset,
                           const uint32_t *__restrict__ in,
                           uint32_t *__restrict__ out) {
  const uint32_t width = 6;
  const uint32_t mask = (1U << width) - 1;
  uint32_t previous = initoffset; // last raw input value seen
  uint32_t word = 0;              // output word being assembled
  uint32_t used = 0;              // bits of `word` already filled
  for (int k = 0; k < 32; ++k) {
    const uint32_t gap = (in[k] - previous) & mask; // delta, modulo 2^6
    previous = in[k];
    word |= gap << used;
    used += width;
    if (used >= 32) { // word full: flush it, carry over any spilled bits
      *out++ = word;
      used -= 32;
      word = (used != 0) ? (gap >> (width - used)) : 0;
    }
  }
}
<< 10 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 16 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 22 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 6)) >> (6 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 14 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 6)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; -} - - - - -void __integratedfastpack7(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 7) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 7 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 14 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 21 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 3 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 10 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 17 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 24 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 6 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 13 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 9 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 16 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 23 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 5 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 12 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 19 ; - ++in; 
- *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 1 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 15 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 22 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 7)) >> (7 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 11 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 7)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; -} - - - - -void __integratedfastpack9(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 9) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 9 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 13 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 22 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 17 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 3 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 12 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 21 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 7 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U 
<< 9)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 11 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 6 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 15 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 1 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 10 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 19 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 9)) >> (9 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 5 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 9)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; -} - - - - -void __integratedfastpack10(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 10) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 10 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 6 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 12 
; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 10) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 10 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 6 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 10)) >> (10 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 10)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; -} - - - - -void __integratedfastpack11(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 11) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 11 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 1 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 13 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 3 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 14 ; - ++in; 
- *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 15 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 5 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 6 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 17 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 7 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 8 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 19 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 9 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 20 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 11)) >> (11 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 11)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 21 ; -} - - - - -void __integratedfastpack12(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 12) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in 
- in[-1]) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 12) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 12) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 12) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 12)) >> (12 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 12)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; -} - - - - -void __integratedfastpack13(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 13) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 13 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - 
++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 1 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 15 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 9 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 3 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 4 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 17 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 11); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 11 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 5 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 18 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 12 ; - ++in; - 
*out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 13)) >> (13 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 13)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 19 ; -} - - - - -void __integratedfastpack14(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 14) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 14) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 
14)) >> (14 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 2 ; - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 14)) >> (14 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 14)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 18 ; -} - - - - -void __integratedfastpack15(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 15) ; - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 15 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 13); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 13 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 11); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 11 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 9 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 1 ; 
- ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 16 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 14); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 15)) >> (15 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 15)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 17 ; -} - - - - -void __integratedfastpack17(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 17) ; - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 8); - ++in; - *out |= ((*in - 
in[-1]) % (1U << 17)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 14); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 14 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 9 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 11); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 11 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 13); - ++in; - *out |= ((*in - in[-1]) % (1U << 17)) << 13 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 17)) >> (17 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; -} - - - - -void __integratedfastpack18(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t 
*__restrict__ out) { - *out = (*in - initoffset) % (1U << 18) ; - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 18) ; - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U 
<< 18)) >> (18 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 18)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 18)) >> (18 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; -} - - - - -void __integratedfastpack19(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 19) ; - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 12); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 12 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 11); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 11 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 16); - 
++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 9 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 19)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 19)) >> (19 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; -} - - - - -void __integratedfastpack20(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 20) ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) 
>> (20 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 20) ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 20) ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 20) ; - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 20)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 20)) >> (20 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; -} - - - - -void __integratedfastpack21(const uint32_t 
initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 21) ; - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 10); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 10 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 9); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 9 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 14); - ++in; - *out |= ((*in - 
in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 21)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 21)) >> (21 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; -} - - - - -void __integratedfastpack22(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 22) ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 8); - ++in; - *out 
|= ((*in - in[-1]) % (1U << 22)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 22) ; - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 22)) << 8 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 22)) >> (22 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; -} - - - - -void __integratedfastpack23(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 23) ; - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = 
((*in - in[-1]) % (1U << 23)) >> (23 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 7); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 7 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 21); - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 8); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 8 ; - 
++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 23)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 23)) >> (23 - 9); - ++in; - *out |= ((*in - in[-1])) << 9 ; -} - - - - -void __integratedfastpack24(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - 
in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 24) ; - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 24)) >> (24 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; -} - - - - -void __integratedfastpack25(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 25) ; - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - 
++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 5); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 5 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 23); - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 9); - ++in; - *out |= ((*in - in[-1])) << 9 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 6); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 6 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 25)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 
- 21); - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 25)) >> (25 - 7); - ++in; - *out |= ((*in - in[-1])) << 7 ; -} - - - - -void __integratedfastpack26(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 26) ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 26)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 26)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 26) ; - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 
20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 26)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 26)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 26)) >> (26 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; -} - - - - -void __integratedfastpack27(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 27) ; - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 7); - ++in; - *out |= ((*in - in[-1])) << 7 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 
2); - ++in; - *out |= ((*in - in[-1]) % (1U << 27)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 9); - ++in; - *out |= ((*in - in[-1])) << 9 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 4); - ++in; - *out |= ((*in - in[-1]) % (1U << 27)) << 4 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 26); - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 21); - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 27)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 23); - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 3); - ++in; - *out |= ((*in - in[-1]) % (1U << 27)) << 3 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 25); 
- ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 27)) >> (27 - 5); - ++in; - *out |= ((*in - in[-1])) << 5 ; -} - - - - -void __integratedfastpack28(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 28) ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 28) ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); - ++in; - 
*out |= ((*in - in[-1])) << 4 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 28) ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 28) ; - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 28)) >> (28 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; -} - - - - -void __integratedfastpack29(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 29) ; - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 26); - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 23); - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 20); - ++in; - *out |= ((*in - 
in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 5); - ++in; - *out |= ((*in - in[-1])) << 5 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 2); - ++in; - *out |= ((*in - in[-1]) % (1U << 29)) << 2 ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 28); - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 25); - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 7); - ++in; - *out |= ((*in - in[-1])) << 7 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 1); - ++in; - *out |= ((*in - in[-1]) % (1U << 29)) << 1 ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 27); - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 24); - ++in; - *out |= ((*in - in[-1])) 
<< 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 21); - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 9); - ++in; - *out |= ((*in - in[-1])) << 9 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - *out = ((*in - in[-1]) % (1U << 29)) >> (29 - 3); - ++in; - *out |= ((*in - in[-1])) << 3 ; -} - - - - -void __integratedfastpack30(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 30) ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 28); - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 26); - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 10); - ++in; - *out |= ((*in - in[-1])) << 
10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 2); - ++in; - *out |= ((*in - in[-1])) << 2 ; - ++out; - ++in; - *out = (*in - in[-1]) % (1U << 30) ; - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 28); - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 26); - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - *out = ((*in - in[-1]) % (1U << 30)) >> (30 - 2); - ++in; - *out |= ((*in - in[-1])) << 2 ; -} - - - - -void 
__integratedfastpack31(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = (*in - initoffset) % (1U << 31) ; - ++in; - *out |= ((*in - in[-1])) << 31 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 30); - ++in; - *out |= ((*in - in[-1])) << 30 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 29); - ++in; - *out |= ((*in - in[-1])) << 29 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 28); - ++in; - *out |= ((*in - in[-1])) << 28 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 27); - ++in; - *out |= ((*in - in[-1])) << 27 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 26); - ++in; - *out |= ((*in - in[-1])) << 26 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 25); - ++in; - *out |= ((*in - in[-1])) << 25 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 24); - ++in; - *out |= ((*in - in[-1])) << 24 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 23); - ++in; - *out |= ((*in - in[-1])) << 23 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 22); - ++in; - *out |= ((*in - in[-1])) << 22 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 21); - ++in; - *out |= ((*in - in[-1])) << 21 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 20); - ++in; - *out |= ((*in - in[-1])) << 20 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 19); - ++in; - *out |= ((*in - in[-1])) << 19 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 18); - ++in; - *out |= ((*in - in[-1])) << 18 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 17); - ++in; - *out |= ((*in - in[-1])) << 17 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 16); - ++in; - *out |= ((*in - in[-1])) << 16 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 15); - ++in; - *out |= ((*in - in[-1])) << 15 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 14); - ++in; - *out |= ((*in - in[-1])) << 14 ; - ++out; - 
*out = ((*in - in[-1]) % (1U << 31)) >> (31 - 13); - ++in; - *out |= ((*in - in[-1])) << 13 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 12); - ++in; - *out |= ((*in - in[-1])) << 12 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 11); - ++in; - *out |= ((*in - in[-1])) << 11 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 10); - ++in; - *out |= ((*in - in[-1])) << 10 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 9); - ++in; - *out |= ((*in - in[-1])) << 9 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 8); - ++in; - *out |= ((*in - in[-1])) << 8 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 7); - ++in; - *out |= ((*in - in[-1])) << 7 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 6); - ++in; - *out |= ((*in - in[-1])) << 6 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 5); - ++in; - *out |= ((*in - in[-1])) << 5 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 4); - ++in; - *out |= ((*in - in[-1])) << 4 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 3); - ++in; - *out |= ((*in - in[-1])) << 3 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 2); - ++in; - *out |= ((*in - in[-1])) << 2 ; - ++out; - *out = ((*in - in[-1]) % (1U << 31)) >> (31 - 1); - ++in; - *out |= ((*in - in[-1])) << 1 ; -} - - - -/*assumes that integers fit in the prescribed number of bits*/ -void __integratedfastpack1(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = *(in++) - initoffset ; - for (uint32_t i = 1 ; i < 32; i += 1) { - *out |= (*in - in[-1]) << i ; - ++in ; - } -} - - - -/*assumes that integers fit in the prescribed number of bits*/ -void __integratedfastpack4(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = *(in++) - initoffset ; - for (uint32_t i = 4 ; i < 32; i += 4) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in 
; - for (uint32_t i = 4 ; i < 32; i += 4) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 4 ; i < 32; i += 4) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 4 ; i < 32; i += 4) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; -} - - - -/*assumes that integers fit in the prescribed number of bits*/ -void __integratedfastpack8(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = *(in++) - initoffset ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 8 ; i < 32; i += 8) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; -} - - +/*assumes that integers fit in the prescribed number of bits*/ +void __integratedfastpack8(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = *(in++) - initoffset; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; 
+ ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 8; i < 32; i += 8) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; +} /*assumes that integers fit in the prescribed number of bits*/ -void __integratedfastpack16(const uint32_t initoffset, const uint32_t *__restrict__ in, - uint32_t *__restrict__ out) { - *out = *(in++) - initoffset ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; 
- *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; - *out = *in - in[-1] ; - ++in ; - for (uint32_t i = 16 ; i < 32; i += 16) { - *out |= (*in - in[-1]) << i ; - ++in ; - } - ++out; +void __integratedfastpack16(const uint32_t initoffset, + const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out = *(in++) - initoffset; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) 
{ + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; + *out = *in - in[-1]; + ++in; + for (uint32_t i = 16; i < 32; i += 16) { + *out |= (*in - in[-1]) << i; + ++in; + } + ++out; } } // namespace SIMDCompressionLib diff --git a/src/intersection.cpp b/src/intersection.cpp index c7488b7..f8eb1f9 100644 --- a/src/intersection.cpp +++ b/src/intersection.cpp @@ -21,157 +21,152 @@ namespace SIMDCompressionLib { */ static size_t __frogadvanceUntil(const uint32_t *array, const size_t pos, const size_t length, const size_t min) { - size_t lower = pos + 1; - - // special handling for a possibly common sequential case - if ((lower >= length) or (array[lower] >= min)) { - return lower; - } - - size_t spansize = 1; // could set larger - // bootstrap an upper limit - - while ((lower + spansize < length) and (array[lower + spansize] < min)) - spansize *= 2; - size_t upper = (lower + spansize < length) ? 
lower + spansize : length - 1; - - if (array[upper] < min) {// means array has no item >= min - return length; - } - - // we know that the next-smallest span was too small - lower += (spansize / 2); - - // else begin binary search - size_t mid = 0; - while (lower + 1 != upper) { - mid = (lower + upper) / 2; - if (array[mid] == min) { - return mid; - } else if (array[mid] < min) - lower = mid; - else - upper = mid; - } - return upper; - + size_t lower = pos + 1; + + // special handling for a possibly common sequential case + if ((lower >= length) or (array[lower] >= min)) { + return lower; + } + + size_t spansize = 1; // could set larger + // bootstrap an upper limit + + while ((lower + spansize < length) and (array[lower + spansize] < min)) + spansize *= 2; + size_t upper = (lower + spansize < length) ? lower + spansize : length - 1; + + if (array[upper] < min) { // means array has no item >= min + return length; + } + + // we know that the next-smallest span was too small + lower += (spansize / 2); + + // else begin binary search + size_t mid = 0; + while (lower + 1 != upper) { + mid = (lower + upper) / 2; + if (array[mid] == min) { + return mid; + } else if (array[mid] < min) + lower = mid; + else + upper = mid; + } + return upper; } - size_t onesidedgallopingintersection(const uint32_t *smallset, - const size_t smalllength, const uint32_t *largeset, + const size_t smalllength, + const uint32_t *largeset, const size_t largelength, uint32_t *out) { - if (largelength < smalllength) return onesidedgallopingintersection(largeset, largelength, smallset, smalllength, out); - if (0 == smalllength) - return 0; - const uint32_t *const initout(out); - size_t k1 = 0, k2 = 0; - while (true) { - if (largeset[k1] < smallset[k2]) { - k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); - if (k1 == largelength) - break; - } -midpoint: - if (smallset[k2] < largeset[k1]) { - ++k2; - if (k2 == smalllength) - break; - } else { - *out++ = smallset[k2]; - ++k2; - if (k2 
== smalllength) - break; - k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); - if (k1 == largelength) - break; - goto midpoint; - } + if (largelength < smalllength) + return onesidedgallopingintersection(largeset, largelength, smallset, + smalllength, out); + if (0 == smalllength) + return 0; + const uint32_t *const initout(out); + size_t k1 = 0, k2 = 0; + while (true) { + if (largeset[k1] < smallset[k2]) { + k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); + if (k1 == largelength) + break; } - return out - initout; - + midpoint: + if (smallset[k2] < largeset[k1]) { + ++k2; + if (k2 == smalllength) + break; + } else { + *out++ = smallset[k2]; + ++k2; + if (k2 == smalllength) + break; + k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); + if (k1 == largelength) + break; + goto midpoint; + } + } + return out - initout; } - - /** * Fast scalar scheme designed by N. Kurz. */ -size_t scalar(const uint32_t *A, const size_t lenA, - const uint32_t *B, const size_t lenB, uint32_t *out) { - const uint32_t *const initout(out); - if (lenA == 0 || lenB == 0) - return 0; - - const uint32_t *endA = A + lenA; - const uint32_t *endB = B + lenB; - - while (1) { - while (*A < *B) { -SKIP_FIRST_COMPARE: - if (++A == endA) - return (out - initout); - } - while (*A > *B) { - if (++B == endB) - return (out - initout); - } - if (*A == *B) { - *out++ = *A; - if (++A == endA || ++B == endB) - return (out - initout); - } else { - goto SKIP_FIRST_COMPARE; - } +size_t scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, + const size_t lenB, uint32_t *out) { + const uint32_t *const initout(out); + if (lenA == 0 || lenB == 0) + return 0; + + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + while (1) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) + return (out - initout); + } + while (*A > *B) { + if (++B == endB) + return (out - initout); + } + if (*A == *B) { + *out++ = *A; + if (++A == endA || ++B 
== endB) + return (out - initout); + } else { + goto SKIP_FIRST_COMPARE; } + } - return (out - initout); // NOTREACHED + return (out - initout); // NOTREACHED } +size_t match_scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, + const size_t lenB, uint32_t *out) { + const uint32_t *initout = out; + if (lenA == 0 || lenB == 0) + return 0; + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; - -size_t match_scalar(const uint32_t *A, const size_t lenA, - const uint32_t *B, const size_t lenB, - uint32_t *out) { - - const uint32_t *initout = out; - if (lenA == 0 || lenB == 0) return 0; - - const uint32_t *endA = A + lenA; - const uint32_t *endB = B + lenB; - - while (1) { - while (*A < *B) { -SKIP_FIRST_COMPARE: - if (++A == endA) goto FINISH; - } - while (*A > *B) { - if (++B == endB) goto FINISH; - } - if (*A == *B) { - *out++ = *A; - if (++A == endA || ++B == endB) goto FINISH; - } else { - goto SKIP_FIRST_COMPARE; - } + while (1) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) + goto FINISH; } + while (*A > *B) { + if (++B == endB) + goto FINISH; + } + if (*A == *B) { + *out++ = *A; + if (++A == endA || ++B == endB) + goto FINISH; + } else { + goto SKIP_FIRST_COMPARE; + } + } FINISH: - return (out - initout); + return (out - initout); } - #ifdef __GNUC__ -#define COMPILER_LIKELY(x) __builtin_expect((x),1) -#define COMPILER_RARELY(x) __builtin_expect((x),0) +#define COMPILER_LIKELY(x) __builtin_expect((x), 1) +#define COMPILER_RARELY(x) __builtin_expect((x), 0) #else -#define COMPILER_LIKELY(x) x -#define COMPILER_RARELY(x) x +#define COMPILER_LIKELY(x) x +#define COMPILER_RARELY(x) x #endif - /** * Intersections scheme designed by N. 
Kurz that works very * well when intersecting an array with another where the density @@ -187,95 +182,96 @@ size_t match_scalar(const uint32_t *A, const size_t lenA, * */ size_t v1(const uint32_t *rare, size_t lenRare, const uint32_t *freq, - size_t lenFreq, uint32_t *matchOut) { - assert(lenRare <= lenFreq); - const uint32_t *matchOrig = matchOut; - if (lenFreq == 0 || lenRare == 0) - return 0; - - const uint64_t kFreqSpace = 2 * 4 * (0 + 1) - 1; - const uint64_t kRareSpace = 0; - - const uint32_t *stopFreq = &freq[lenFreq] - kFreqSpace; - const uint32_t *stopRare = &rare[lenRare] - kRareSpace; - - __m128i Rare; - - __m128i F0, F1; - - if (COMPILER_RARELY((rare >= stopRare) || (freq >= stopFreq))) - goto FINISH_SCALAR; - uint32_t valRare; - valRare = rare[0]; - Rare = _mm_set1_epi32(valRare); - - uint64_t maxFreq; - maxFreq = freq[2 * 4 - 1]; - F0 = _mm_lddqu_si128(reinterpret_cast(freq)); - F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); - - if (COMPILER_RARELY(maxFreq < valRare)) - goto ADVANCE_FREQ; - - ADVANCE_RARE: do { - *matchOut = valRare; - rare += 1; - if (COMPILER_RARELY(rare >= stopRare)) { - rare -= 1; - goto FINISH_SCALAR; - } - valRare = rare[0]; // for next iteration - F0 = _mm_cmpeq_epi32(F0, Rare); - F1 = _mm_cmpeq_epi32(F1, Rare); - Rare = _mm_set1_epi32(valRare); - F0 = _mm_or_si128(F0, F1); + size_t lenFreq, uint32_t *matchOut) { + assert(lenRare <= lenFreq); + const uint32_t *matchOrig = matchOut; + if (lenFreq == 0 || lenRare == 0) + return 0; + + const uint64_t kFreqSpace = 2 * 4 * (0 + 1) - 1; + const uint64_t kRareSpace = 0; + + const uint32_t *stopFreq = &freq[lenFreq] - kFreqSpace; + const uint32_t *stopRare = &rare[lenRare] - kRareSpace; + + __m128i Rare; + + __m128i F0, F1; + + if (COMPILER_RARELY((rare >= stopRare) || (freq >= stopFreq))) + goto FINISH_SCALAR; + uint32_t valRare; + valRare = rare[0]; + Rare = _mm_set1_epi32(valRare); + + uint64_t maxFreq; + maxFreq = freq[2 * 4 - 1]; + F0 = _mm_lddqu_si128(reinterpret_cast(freq)); 
+ F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); + + if (COMPILER_RARELY(maxFreq < valRare)) + goto ADVANCE_FREQ; + +ADVANCE_RARE: + do { + *matchOut = valRare; + rare += 1; + if (COMPILER_RARELY(rare >= stopRare)) { + rare -= 1; + goto FINISH_SCALAR; + } + valRare = rare[0]; // for next iteration + F0 = _mm_cmpeq_epi32(F0, Rare); + F1 = _mm_cmpeq_epi32(F1, Rare); + Rare = _mm_set1_epi32(valRare); + F0 = _mm_or_si128(F0, F1); #ifdef __SSE4_1__ - if(_mm_testz_si128(F0,F0) == 0) - matchOut++; + if (_mm_testz_si128(F0, F0) == 0) + matchOut++; #else - if (_mm_movemask_epi8(F0)) - matchOut++; + if (_mm_movemask_epi8(F0)) + matchOut++; #endif - F0 = _mm_lddqu_si128(reinterpret_cast(freq)); - F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); + F0 = _mm_lddqu_si128(reinterpret_cast(freq)); + F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); - } while (maxFreq >= valRare); + } while (maxFreq >= valRare); - uint64_t maxProbe; + uint64_t maxProbe; - ADVANCE_FREQ: do { - const uint64_t kProbe = (0 + 1) * 2 * 4; - const uint32_t *probeFreq = freq + kProbe; +ADVANCE_FREQ: + do { + const uint64_t kProbe = (0 + 1) * 2 * 4; + const uint32_t *probeFreq = freq + kProbe; - if (COMPILER_RARELY(probeFreq >= stopFreq)) { - goto FINISH_SCALAR; - } - maxProbe = freq[(0 + 2) * 2 * 4 - 1]; + if (COMPILER_RARELY(probeFreq >= stopFreq)) { + goto FINISH_SCALAR; + } + maxProbe = freq[(0 + 2) * 2 * 4 - 1]; - freq = probeFreq; + freq = probeFreq; - } while (maxProbe < valRare); + } while (maxProbe < valRare); - maxFreq = maxProbe; + maxFreq = maxProbe; - F0 = _mm_lddqu_si128(reinterpret_cast(freq)); - F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); + F0 = _mm_lddqu_si128(reinterpret_cast(freq)); + F1 = _mm_lddqu_si128(reinterpret_cast(freq + 4)); - goto ADVANCE_RARE; + goto ADVANCE_RARE; - size_t count; - FINISH_SCALAR: count = matchOut - matchOrig; + size_t count; +FINISH_SCALAR: + count = matchOut - matchOrig; - lenFreq = stopFreq + kFreqSpace - freq; - lenRare = stopRare + kRareSpace - 
rare; + lenFreq = stopFreq + kFreqSpace - freq; + lenRare = stopRare + kRareSpace - rare; - size_t tail = match_scalar(freq, lenFreq, rare, lenRare, matchOut); + size_t tail = match_scalar(freq, lenFreq, rare, lenRare, matchOut); - return count + tail; + return count + tail; } - - /** * This intersection function is similar to v1, but is faster when * the difference between lenRare and lenFreq is large, but not too large. @@ -290,113 +286,176 @@ size_t v1(const uint32_t *rare, size_t lenRare, const uint32_t *freq, * * This function DOES NOT use inline assembly instructions. Just intrinsics. */ -size_t v3(const uint32_t *rare, const size_t lenRare, - const uint32_t *freq, const size_t lenFreq, uint32_t *out) { - if (lenFreq == 0 || lenRare == 0) - return 0; - assert(lenRare <= lenFreq); - const uint32_t *const initout(out); - typedef __m128i vec; - const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); - const size_t vecmax = veclen - 1; - const size_t freqspace = 32 * veclen; - const size_t rarespace = 1; - - const uint32_t *stopFreq = freq + lenFreq - freqspace; - const uint32_t *stopRare = rare + lenRare - rarespace; - if (freq > stopFreq) { - return scalar(freq, lenFreq, rare, lenRare, out); +size_t v3(const uint32_t *rare, const size_t lenRare, const uint32_t *freq, + const size_t lenFreq, uint32_t *out) { + if (lenFreq == 0 || lenRare == 0) + return 0; + assert(lenRare <= lenFreq); + const uint32_t *const initout(out); + typedef __m128i vec; + const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); + const size_t vecmax = veclen - 1; + const size_t freqspace = 32 * veclen; + const size_t rarespace = 1; + + const uint32_t *stopFreq = freq + lenFreq - freqspace; + const uint32_t *stopRare = rare + lenRare - rarespace; + if (freq > stopFreq) { + return scalar(freq, lenFreq, rare, lenRare, out); + } + while (freq[veclen * 31 + vecmax] < *rare) { + freq += veclen * 32; + if (freq > stopFreq) + goto FINISH_SCALAR; + } + for (; rare < stopRare; ++rare) { + const 
uint32_t matchRare = *rare; // nextRare; + const vec Match = _mm_set1_epi32(matchRare); + while (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible + freq += veclen * 32; // advance 32 vectors + if (freq > stopFreq) + goto FINISH_SCALAR; } - while (freq[veclen * 31 + vecmax] < *rare) { - freq += veclen * 32; - if (freq > stopFreq) - goto FINISH_SCALAR; + vec Q0, Q1, Q2, Q3; + if (freq[veclen * 15 + vecmax] >= matchRare) { + if (freq[veclen * 7 + vecmax] < matchRare) { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 8), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 9), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 10), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 11), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 12), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 13), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 14), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 15), + Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 4), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 5), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 6), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 7), + Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 0), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 1), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 2), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 3), + Match)); + } + } else { + if (freq[veclen * 23 + vecmax] < matchRare) { + Q0 = _mm_or_si128( + 
_mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 8 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 9 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 10 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 11 + 16), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 12 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 13 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 14 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 15 + 16), + Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 4 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 5 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 6 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 7 + 16), + Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 0 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 1 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 2 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 3 + 16), + Match)); + } } - for (; rare < stopRare; ++rare) { - const uint32_t matchRare = *rare;//nextRare; - const vec Match = _mm_set1_epi32(matchRare); - while (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible - freq += veclen * 32; // advance 32 vectors - if (freq > stopFreq) - goto FINISH_SCALAR; - } - vec Q0, Q1, Q2, Q3; - if (freq[veclen * 15 + vecmax] >= matchRare) { - if (freq[veclen * 7 + vecmax] < matchRare) { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 8), Match), - 
_mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 9), Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 10), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 11), Match)); - - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 12), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 13), Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 14), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 15), Match)); - } else { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 4), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 5), Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 6), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 7), Match)); - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 0), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 1), Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 2), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 3), Match)); - } - } else { - if (freq[veclen * 23 + vecmax] < matchRare) { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 8 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 9 + 16), Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 10 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 11 + 16), Match)); - - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 12 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 13 + 16), Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 14 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 15 + 16), 
Match)); - } else { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 4 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 5 + 16), Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 6 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 7 + 16), Match)); - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 0 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 1 + 16), Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 2 + 16), Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 3 + 16), Match)); - } - - } - const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1), _mm_or_si128(Q2, Q3)); + const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1), _mm_or_si128(Q2, Q3)); #ifdef __SSE4_1__ - if (_mm_testz_si128(F0, F0)) { -#else - if (!_mm_movemask_epi8(F0)) { -#endif - } else { - *out++ = matchRare; - } + if (_mm_testz_si128(F0, F0)) { +#else + if (!_mm_movemask_epi8(F0)) { +#endif + } else { + *out++ = matchRare; } + } -FINISH_SCALAR: return (out - initout) + scalar(freq, - stopFreq + freqspace - freq, rare, stopRare + rarespace - rare, out); +FINISH_SCALAR: + return (out - initout) + scalar(freq, stopFreq + freqspace - freq, rare, + stopRare + rarespace - rare, out); } - /** * This is the SIMD galloping function. This intersection function works well * when lenRare and lenFreq have vastly different values. @@ -412,173 +471,199 @@ FINISH_SCALAR: return (out - initout) + scalar(freq, * This function DOES NOT use assembly. It only relies on intrinsics. 
*/ size_t SIMDgalloping(const uint32_t *rare, const size_t lenRare, - const uint32_t *freq, const size_t lenFreq, uint32_t *out) { - if (lenFreq == 0 || lenRare == 0) - return 0; - assert(lenRare <= lenFreq); - const uint32_t *const initout(out); - typedef __m128i vec; - const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); - const size_t vecmax = veclen - 1; - const size_t freqspace = 32 * veclen; - const size_t rarespace = 1; - - const uint32_t *stopFreq = freq + lenFreq - freqspace; - const uint32_t *stopRare = rare + lenRare - rarespace; - if (freq > stopFreq) { - return scalar(freq, lenFreq, rare, lenRare, out); - } - for (; rare < stopRare; ++rare) { - const uint32_t matchRare = *rare;//nextRare; - const vec Match = _mm_set1_epi32(matchRare); - - if (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible - uint32_t offset = 1; - if (freq + veclen * 32 > stopFreq) { - freq += veclen * 32; - goto FINISH_SCALAR; - } - while (freq[veclen * offset * 32 + veclen * 31 + vecmax] - < matchRare) { // if no match possible - if (freq + veclen * (2 * offset) * 32 <= stopFreq) { - offset *= 2; - } else if (freq + veclen * (offset + 1) * 32 <= stopFreq) { - offset = static_cast((stopFreq - freq) / (veclen * 32)); - //offset += 1; - if (freq[veclen * offset * 32 + veclen * 31 + vecmax] - < matchRare) { - freq += veclen * offset * 32; - goto FINISH_SCALAR; - } else { - break; - } - } else { - freq += veclen * offset * 32; - goto FINISH_SCALAR; - } - } - uint32_t lower = offset / 2; - while (lower + 1 != offset) { - const uint32_t mid = (lower + offset) / 2; - if (freq[veclen * mid * 32 + veclen * 31 + vecmax] - < matchRare) - lower = mid; - else - offset = mid; - } + const uint32_t *freq, const size_t lenFreq, + uint32_t *out) { + if (lenFreq == 0 || lenRare == 0) + return 0; + assert(lenRare <= lenFreq); + const uint32_t *const initout(out); + typedef __m128i vec; + const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); + const size_t vecmax = veclen - 1; + 
const size_t freqspace = 32 * veclen; + const size_t rarespace = 1; + + const uint32_t *stopFreq = freq + lenFreq - freqspace; + const uint32_t *stopRare = rare + lenRare - rarespace; + if (freq > stopFreq) { + return scalar(freq, lenFreq, rare, lenRare, out); + } + for (; rare < stopRare; ++rare) { + const uint32_t matchRare = *rare; // nextRare; + const vec Match = _mm_set1_epi32(matchRare); + + if (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible + uint32_t offset = 1; + if (freq + veclen * 32 > stopFreq) { + freq += veclen * 32; + goto FINISH_SCALAR; + } + while (freq[veclen * offset * 32 + veclen * 31 + vecmax] < + matchRare) { // if no match possible + if (freq + veclen * (2 * offset) * 32 <= stopFreq) { + offset *= 2; + } else if (freq + veclen * (offset + 1) * 32 <= stopFreq) { + offset = static_cast((stopFreq - freq) / (veclen * 32)); + // offset += 1; + if (freq[veclen * offset * 32 + veclen * 31 + vecmax] < matchRare) { freq += veclen * offset * 32; - } - vec Q0, Q1, Q2, Q3; - if (freq[veclen * 15 + vecmax] >= matchRare) { - if (freq[veclen * 7 + vecmax] < matchRare) { - Q0 - = _mm_or_si128( - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 8), Match), - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 9), Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 10), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 11), - Match)); - - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 12), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 13), - Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 14), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 15), - Match)); - } else { - Q0 - = _mm_or_si128( - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 4), Match), - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 5), Match)); - Q1 - = 
_mm_or_si128( - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 6), Match), - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 7), Match)); - Q2 - = _mm_or_si128( - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 0), Match), - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 1), Match)); - Q3 - = _mm_or_si128( - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 2), Match), - _mm_cmpeq_epi32( - _mm_loadu_si128(reinterpret_cast(freq) + 3), Match)); - } + goto FINISH_SCALAR; + } else { + break; + } } else { - if (freq[veclen * 23 + vecmax] < matchRare) { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 8 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 9 + 16), - Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 10 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 11 + 16), - Match)); - - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 12 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 13 + 16), - Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 14 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 15 + 16), - Match)); - } else { - Q0 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 4 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 5 + 16), - Match)); - Q1 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 6 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 7 + 16), - Match)); - Q2 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 0 + 16), - Match), - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 1 + 16), - Match)); - Q3 = _mm_or_si128( - _mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 2 + 16), - Match), - 
_mm_cmpeq_epi32(_mm_loadu_si128(reinterpret_cast(freq) + 3 + 16), - Match)); - } - + freq += veclen * offset * 32; + goto FINISH_SCALAR; } - const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1), _mm_or_si128(Q2, Q3)); + } + uint32_t lower = offset / 2; + while (lower + 1 != offset) { + const uint32_t mid = (lower + offset) / 2; + if (freq[veclen * mid * 32 + veclen * 31 + vecmax] < matchRare) + lower = mid; + else + offset = mid; + } + freq += veclen * offset * 32; + } + vec Q0, Q1, Q2, Q3; + if (freq[veclen * 15 + vecmax] >= matchRare) { + if (freq[veclen * 7 + vecmax] < matchRare) { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 8), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 9), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 10), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 11), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 12), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 13), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 14), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 15), + Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 4), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 5), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 6), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 7), + Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 0), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 1), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 2), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 3), + Match)); 
+ } + } else { + if (freq[veclen * 23 + vecmax] < matchRare) { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 8 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 9 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 10 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 11 + 16), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 12 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 13 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 14 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 15 + 16), + Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 4 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 5 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 6 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 7 + 16), + Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 0 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 1 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 2 + 16), + Match), + _mm_cmpeq_epi32( + _mm_loadu_si128(reinterpret_cast(freq) + 3 + 16), + Match)); + } + } + const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1), _mm_or_si128(Q2, Q3)); #ifdef __SSE4_1__ - if (_mm_testz_si128(F0, F0)) { -#else - if (!_mm_movemask_epi8(F0)) { -#endif - } else { - *out++ = matchRare; - } + if (_mm_testz_si128(F0, F0)) { +#else + if (!_mm_movemask_epi8(F0)) { +#endif + } else { + *out++ = matchRare; } + } -FINISH_SCALAR: return (out - initout) + scalar(freq, - stopFreq + freqspace - freq, rare, stopRare + 
rarespace - rare, out); +FINISH_SCALAR: + return (out - initout) + scalar(freq, stopFreq + freqspace - freq, rare, + stopRare + rarespace - rare, out); } /** @@ -587,215 +672,223 @@ FINISH_SCALAR: return (out - initout) + scalar(freq, * The out pointer can be set1 if length1<=length2, * or else it can be set2 if length1>length2. */ -size_t SIMDintersection(const uint32_t *set1, - const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out) { - if ((length1 == 0) or (length2 == 0)) return 0; - +size_t SIMDintersection(const uint32_t *set1, const size_t length1, + const uint32_t *set2, const size_t length2, + uint32_t *out) { + if ((length1 == 0) or (length2 == 0)) + return 0; - if ((1000 * length1 <= length2) or (1000 * length2 <= length1)) { - if (length1 <= length2) - return SIMDgalloping(set1, length1, set2, length2, out); - else - return SIMDgalloping(set2, length2, set1, length1, out); - } - - if ((50 * length1 <= length2) or (50 * length2 <= length1)) { - if (length1 <= length2) - return v3(set1, length1, set2, length2, out); - else - return v3(set2, length2, set1, length1, out); - } + if ((1000 * length1 <= length2) or (1000 * length2 <= length1)) { + if (length1 <= length2) + return SIMDgalloping(set1, length1, set2, length2, out); + else + return SIMDgalloping(set2, length2, set1, length1, out); + } + if ((50 * length1 <= length2) or (50 * length2 <= length1)) { if (length1 <= length2) - return v1(set1, length1, set2, length2, out); + return v3(set1, length1, set2, length2, out); else - return v1(set2, length2, set1, length1, out); + return v3(set2, length2, set1, length1, out); + } + + if (length1 <= length2) + return v1(set1, length1, set2, length2, out); + else + return v1(set2, length2, set1, length1, out); } /** * More or less from * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ */ const static __m128i shuffle_mask[16] = { - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), - 
_mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,7,6,5,4), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,11,10,9,8), - _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,7,6,5,4), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,15,14,13,12), - _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,7,6,5,4), - _mm_set_epi8(15,14,13,12,15,14,13,12,7,6,5,4,3,2,1,0), - _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8), - _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,3,2,1,0), - _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,7,6,5,4), - _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), - }; + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 11, 10, 9, 8), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 7, 6, 5, 4), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 15, 14, 13, 12), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 7, 6, 5, 4), + _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 7, 6, 5, 4, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 11, 10, 9, 8), + _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 11, 10, 9, 8, 3, 2, 1, 0), + _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4), + _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 
4, 3, 2, 1, 0), +}; // precomputed dictionary - /** - * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ + * Taken almost verbatim from + * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ * * It is not safe for out to be either A or B. */ size_t highlyscalable_intersect_SIMD(const uint32_t *A, const size_t s_a, - const uint32_t *B, const size_t s_b, uint32_t * out) { - assert(out != A); - assert(out != B); - const uint32_t * const initout (out); - size_t i_a = 0, i_b = 0; - - // trim lengths to be a multiple of 4 - size_t st_a = (s_a / 4) * 4; - size_t st_b = (s_b / 4) * 4; - - while (i_a < st_a && i_b < st_b) { - //[ load segments of four 32-bit elements - __m128i v_a = _mm_loadu_si128((__m128i *) &A[i_a]); - __m128i v_b = _mm_loadu_si128((__m128i *) &B[i_b]); - //] - - //[ move pointers - const uint32_t a_max = A[i_a + 3]; - const uint32_t b_max = B[i_b + 3]; - i_a += (a_max <= b_max) * 4; - i_b += (a_max >= b_max) * 4; - //] - - //[ compute mask of common elements - const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1); - __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison - v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling - __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again... - v_b = _mm_shuffle_epi32(v_b, cyclic_shift); - __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again... - v_b = _mm_shuffle_epi32(v_b, cyclic_shift); - __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again. 
- __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2), - _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks - // convert the 128-bit mask to the 4-bit mask - const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask)); - //] - - //[ copy out common elements - const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); - _mm_storeu_si128((__m128i*)out, p); - out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask - //] - } - - // intersect the tail using scalar intersection - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - *out++ = B[i_b]; ; - i_a++; - i_b++; - } + const uint32_t *B, const size_t s_b, + uint32_t *out) { + assert(out != A); + assert(out != B); + const uint32_t *const initout(out); + size_t i_a = 0, i_b = 0; + + // trim lengths to be a multiple of 4 + size_t st_a = (s_a / 4) * 4; + size_t st_b = (s_b / 4) * 4; + + while (i_a < st_a && i_b < st_b) { + //[ load segments of four 32-bit elements + __m128i v_a = _mm_loadu_si128((__m128i *)&A[i_a]); + __m128i v_b = _mm_loadu_si128((__m128i *)&B[i_b]); + //] + + //[ move pointers + const uint32_t a_max = A[i_a + 3]; + const uint32_t b_max = B[i_b + 3]; + i_a += (a_max <= b_max) * 4; + i_b += (a_max >= b_max) * 4; + //] + + //[ compute mask of common elements + const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1); + __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison + v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling + __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again... + v_b = _mm_shuffle_epi32(v_b, cyclic_shift); + __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again... + v_b = _mm_shuffle_epi32(v_b, cyclic_shift); + __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again. 
+ __m128i cmp_mask = _mm_or_si128( + _mm_or_si128(cmp_mask1, cmp_mask2), + _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks + // convert the 128-bit mask to the 4-bit mask + const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask)); + //] + + //[ copy out common elements + const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); + _mm_storeu_si128((__m128i *)out, p); + out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask + //] + } + + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + *out++ = B[i_b]; + ; + i_a++; + i_b++; } + } - return out - initout; + return out - initout; } /** * Version optimized by D. Lemire - * starting from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ + * starting from + * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ * - * The main difference is that we break the data dependency and maximizes superscalar execution. + * The main difference is that we break the data dependency and maximizes + * superscalar execution. * * It is not safe for out to be either A or B. 
*/ size_t lemire_highlyscalable_intersect_SIMD(const uint32_t *A, const size_t s_a, - const uint32_t *B, const size_t s_b, uint32_t * out) { - assert(out != A); - assert(out != B); - const uint32_t * const initout (out); - size_t i_a = 0, i_b = 0; - const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); - const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); - const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); - - // trim lengths to be a multiple of 4 - size_t st_a = (s_a / 4) * 4; - size_t st_b = (s_b / 4) * 4; - if (i_a < st_a && i_b < st_b) { - __m128i v_a, v_b; - v_a = MM_LOAD_SI_128((__m128i *) &A[i_a]); - v_b = MM_LOAD_SI_128((__m128i *) &B[i_b]); - while (true) { - const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison - const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, - _mm_shuffle_epi32(v_b, cyclic_shift1)); // again... - __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2); - const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, - _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again... - cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3); - const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, - _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again. 
- cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4); - // convert the 128-bit mask to the 4-bit mask - const int mask = _mm_movemask_ps(*reinterpret_cast<__m128*>(&cmp_mask)); - // copy out common elements - const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); - - _mm_storeu_si128((__m128i*)out, p); - out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask - - const uint32_t a_max = A[i_a + 3]; - if (a_max <= B[i_b + 3]) { - i_a += 4; - if (i_a >= st_a) - break; - v_a = MM_LOAD_SI_128((__m128i *) &A[i_a]); - } - if (a_max >= B[i_b + 3]) { - i_b += 4; - if (i_b >= st_b) - break; - v_b = MM_LOAD_SI_128((__m128i *) &B[i_b]); - } - - } + const uint32_t *B, const size_t s_b, + uint32_t *out) { + assert(out != A); + assert(out != B); + const uint32_t *const initout(out); + size_t i_a = 0, i_b = 0; + const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); + const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); + const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); + + // trim lengths to be a multiple of 4 + size_t st_a = (s_a / 4) * 4; + size_t st_b = (s_b / 4) * 4; + if (i_a < st_a && i_b < st_b) { + __m128i v_a, v_b; + v_a = MM_LOAD_SI_128((__m128i *)&A[i_a]); + v_b = MM_LOAD_SI_128((__m128i *)&B[i_b]); + while (true) { + const __m128i cmp_mask1 = + _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison + const __m128i cmp_mask2 = _mm_cmpeq_epi32( + v_a, _mm_shuffle_epi32(v_b, cyclic_shift1)); // again... + __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2); + const __m128i cmp_mask3 = _mm_cmpeq_epi32( + v_a, _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again... + cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3); + const __m128i cmp_mask4 = _mm_cmpeq_epi32( + v_a, _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again. 
+ cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4); + // convert the 128-bit mask to the 4-bit mask + const int mask = _mm_movemask_ps(*reinterpret_cast<__m128 *>(&cmp_mask)); + // copy out common elements + const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); + + _mm_storeu_si128((__m128i *)out, p); + out += + _mm_popcnt_u32(mask); // a number of elements is a weight of the mask + + const uint32_t a_max = A[i_a + 3]; + if (a_max <= B[i_b + 3]) { + i_a += 4; + if (i_a >= st_a) + break; + v_a = MM_LOAD_SI_128((__m128i *)&A[i_a]); + } + if (a_max >= B[i_b + 3]) { + i_b += 4; + if (i_b >= st_b) + break; + v_b = MM_LOAD_SI_128((__m128i *)&B[i_b]); + } } - - // intersect the tail using scalar intersection - while (i_a < s_a && i_b < s_b) { - if (A[i_a] < B[i_b]) { - i_a++; - } else if (B[i_b] < A[i_a]) { - i_b++; - } else { - *out++ = B[i_b]; - i_a++; - i_b++; - } + } + + // intersect the tail using scalar intersection + while (i_a < s_a && i_b < s_b) { + if (A[i_a] < B[i_b]) { + i_a++; + } else if (B[i_b] < A[i_a]) { + i_b++; + } else { + *out++ = B[i_b]; + i_a++; + i_b++; } + } - return out - initout; + return out - initout; } - - -inline std::map initializeintersectionfactory() { - std::map schemes; - schemes[ "simd" ] = SIMDintersection; - schemes[ "galloping" ] = onesidedgallopingintersection; - schemes[ "scalar" ] = scalar; - schemes[ "v1" ] = v1; - schemes["v3"] = v3; - schemes["simdgalloping"] = SIMDgalloping; - schemes["highlyscalable_intersect_SIMD"] = highlyscalable_intersect_SIMD; - schemes["lemire_highlyscalable_intersect_SIMD"] = lemire_highlyscalable_intersect_SIMD; - return schemes; +inline std::map +initializeintersectionfactory() { + std::map schemes; + schemes["simd"] = SIMDintersection; + schemes["galloping"] = onesidedgallopingintersection; + schemes["scalar"] = scalar; + schemes["v1"] = v1; + schemes["v3"] = v3; + schemes["simdgalloping"] = SIMDgalloping; + schemes["highlyscalable_intersect_SIMD"] = highlyscalable_intersect_SIMD; + 
schemes["lemire_highlyscalable_intersect_SIMD"] = + lemire_highlyscalable_intersect_SIMD; + return schemes; } -std::map IntersectionFactory::intersection_schemes = initializeintersectionfactory(); - +std::map + IntersectionFactory::intersection_schemes = initializeintersectionfactory(); } // namespace SIMDCompressionLib diff --git a/src/simdbitpacking.cpp b/src/simdbitpacking.cpp index b5eb1a5..599ce48 100644 --- a/src/simdbitpacking.cpp +++ b/src/simdbitpacking.cpp @@ -9,13785 +9,13865 @@ namespace SIMDCompressionLib { using namespace std; -void __SIMD_fastpackwithoutmask0(const uint32_t *__restrict__ , __m128i *__restrict__) {} -void __SIMD_fastpack0(const uint32_t *__restrict__ , __m128i *__restrict__) {} +void __SIMD_fastpackwithoutmask0(const uint32_t *__restrict__, + __m128i *__restrict__) {} +void __SIMD_fastpack0(const uint32_t *__restrict__, __m128i *__restrict__) {} /** - * Rest of the code is borrowed from the fastpfor project (file of the same name) - * with the remove of a few functions at the end and removal of the prefix. Also + * Rest of the code is borrowed from the fastpfor project (file of the same + * name) + * with the remove of a few functions at the end and removal of the prefix. + * Also * functions are no longer static. 
*/ -void __SIMD_fastpackwithoutmask1(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); +void __SIMD_fastpackwithoutmask1(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpackwithoutmask2(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); -} + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask2(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask3(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask3(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); 
+ MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask5(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask5(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask6(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask6(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask7(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask7(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask9(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = MM_LOAD_SI_128(++in); 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask9(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 9 - 5); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask10(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); 
+ InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = MM_LOAD_SI_128(++in); - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask10(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask11(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = 
reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpackwithoutmask12(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastpackwithoutmask11(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg 
= MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg 
= _mm_srli_epi32(InReg, 12 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
20)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask13(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpackwithoutmask14(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpackwithoutmask15(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = MM_LOAD_SI_128(++in); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpackwithoutmask17(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
17 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); +} -void __SIMD_fastpackwithoutmask12(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); +void 
__SIMD_fastpackwithoutmask18(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask19(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask20(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 20 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 
12 - 4); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask21(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask22(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + 
InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask23(const uint32_t *__restrict__ _in, + __m128i 
*__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 
22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask24(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask25(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask26(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask27(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + 
InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = MM_LOAD_SI_128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask28(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask29(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + 
InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask30(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask31(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = MM_LOAD_SI_128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = MM_LOAD_SI_128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + MM_STORE_SI_128(out, OutReg); 
+} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask32(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = MM_LOAD_SI_128(in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + 
MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask4(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + for (uint32_t outer = 0; outer < 4; ++outer) { + InReg = MM_LOAD_SI_128(in); OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 1); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 2); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 3); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = MM_LOAD_SI_128(in + 7); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); MM_STORE_SI_128(out, OutReg); ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, 
OutReg); - + in += 8; + } } +void __SIMD_fastpackwithoutmask8(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; - -void __SIMD_fastpackwithoutmask13(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - + for (uint32_t outer = 0; outer < 8; ++outer) { + InReg = MM_LOAD_SI_128(in); OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 1); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 2); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + InReg = MM_LOAD_SI_128(in + 3); OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); MM_STORE_SI_128(out, OutReg); ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = MM_LOAD_SI_128(++in); + in += 4; + } +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpackwithoutmask16(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + for (uint32_t outer = 0; outer < 16; ++outer) { + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = MM_LOAD_SI_128(in + 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); MM_STORE_SI_128(out, OutReg); ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - + in += 2; + } } +void __SIMD_fastpack1(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 1) - 1); -void __SIMD_fastpackwithoutmask14(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = MM_LOAD_SI_128(++in); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpack2(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); -void __SIMD_fastpackwithoutmask15(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = MM_LOAD_SI_128(++in); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpack3(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); -void __SIMD_fastpackwithoutmask17(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpack5(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); -void __SIMD_fastpackwithoutmask18(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 18 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask19(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); 
- InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask20(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask21(const uint32_t *__restrict__ _in, 
__m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask22(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask23(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); 
- MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask24(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask25(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 
25 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask26(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask27(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask28(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask29(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask30(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 30 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask31(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = 
MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = MM_LOAD_SI_128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); 
- MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = MM_LOAD_SI_128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask32(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - 
MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg 
= MM_LOAD_SI_128(++in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpackwithoutmask4(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg; - - for (uint32_t outer = 0; outer < 4 ; ++outer) { - InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - - InReg = MM_LOAD_SI_128(in + 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - - InReg = MM_LOAD_SI_128(in + 2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = MM_LOAD_SI_128(in + 3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - - InReg = MM_LOAD_SI_128(in + 4); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = MM_LOAD_SI_128(in + 5); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - - InReg = MM_LOAD_SI_128(in + 6); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - - InReg = MM_LOAD_SI_128(in + 7); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - - in += 8; - } - -} - - - -void __SIMD_fastpackwithoutmask8(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg; - - for (uint32_t outer = 0; outer < 8 ; ++outer) { - InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - - InReg = MM_LOAD_SI_128(in + 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = MM_LOAD_SI_128(in + 2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = MM_LOAD_SI_128(in + 3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - - in += 4; - } - -} - - - -void __SIMD_fastpackwithoutmask16(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg; - - for (uint32_t outer = 0; outer < 16 ; ++outer) { - InReg = MM_LOAD_SI_128(in); 
- OutReg = InReg; - - InReg = MM_LOAD_SI_128(in + 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - - in += 2; - } - -} - - - -void __SIMD_fastpack1(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 1) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack2(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void 
__SIMD_fastpack3(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack5(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack6(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 6) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack7(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - 
InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack9(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack10(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - 
MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack11(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack12(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack13(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack14(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack15(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 15 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack17(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 17) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack18(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 18) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack19(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 19) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); 
- ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 
19 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack20(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 20) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack21(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 21) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
10)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack22(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - 
InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack23(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); 
- InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack24(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack25(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 
- 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack26(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack27(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack28(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 
- 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack29(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack30(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 
20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - -} - - - -void __SIMD_fastpack31(const uint32_t 
*__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - - __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); +void __SIMD_fastpack6(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - 
MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastpack32(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const 
__m128i *in = reinterpret_cast(_in); - __m128i OutReg; + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - __m128i InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpack7(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - 
MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - 
MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - ++out; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastpack4(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); - for (uint32_t outer = 0; outer < 4 ; ++outer) { - InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 4), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 5), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 6), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 7), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - in += 8; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpack9(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); -void __SIMD_fastpack8(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - for (uint32_t outer = 0; outer < 8 ; ++outer) { - InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - ++out; + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - in += 4; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastpack16(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - for (uint32_t outer = 0; outer < 16 ; ++outer) { - InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - in += 2; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastunpack1(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg1 = MM_LOAD_SI_128(in); - __m128i InReg2 = InReg1; - __m128i OutReg1, OutReg2, OutReg3, OutReg4; - const __m128i mask = _mm_set1_epi32(1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - unsigned shift = 0; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - for (unsigned i = 0; i < 8; ++i) { - OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++) , mask); - OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++) , mask); - OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++) , mask); - OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++) , mask); - MM_STORE_SI_128(out++, OutReg1); - MM_STORE_SI_128(out++, OutReg2); - MM_STORE_SI_128(out++, OutReg3); - MM_STORE_SI_128(out++, OutReg4); - } -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastunpack2(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 2) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack10(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + const __m128i mask = 
_mm_set1_epi32((1U << 10) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); 
+ InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastunpack3(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); +} + +void __SIMD_fastpack11(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 3) - 1); + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg 
= _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 27) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack12(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_srli_epi32(InReg, 29) ; - MM_STORE_SI_128(out++, OutReg); + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); -void __SIMD_fastunpack4(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = 
_mm_set1_epi32((1U << 4) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); +} + +void __SIMD_fastpack13(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - MM_STORE_SI_128(out++, OutReg); + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack14(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack15(const uint32_t 
*__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack17(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack18(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 28) ; - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack19(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + 
OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpack20(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpack21(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); 
+ InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); } +void __SIMD_fastpack22(const uint32_t *__restrict__ _in, + __m128i 
*__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpack23(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg 
= _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); +} +void __SIMD_fastpack24(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); +} -void __SIMD_fastunpack5(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { +void __SIMD_fastpack25(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 
2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); +} - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 5) - 1); +void __SIMD_fastpack26(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 
18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack27(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack28(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack29(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + 
MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = 
_mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack30(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
30 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + 
InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack31(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + __m128i InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(MM_LOAD_SI_128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack32(const uint32_t 
*__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = 
MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + InReg = MM_LOAD_SI_128(++in); + + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastpack4(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + for (uint32_t outer = 0; outer < 4; ++outer) { + InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 2), mask); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + ++out; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + in += 8; + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack8(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + for (uint32_t outer = 0; outer < 8; ++outer) { + InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + ++out; - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + in += 4; + } +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastpack16(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + for (uint32_t outer = 0; outer < 16; ++outer) { + InReg = _mm_and_si128(MM_LOAD_SI_128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + InReg = _mm_and_si128(MM_LOAD_SI_128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + ++out; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + in += 2; + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack1(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg1 = MM_LOAD_SI_128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + + for (unsigned i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg2 = 
_mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + MM_STORE_SI_128(out++, OutReg1); + MM_STORE_SI_128(out++, OutReg2); + MM_STORE_SI_128(out++, OutReg3); + MM_STORE_SI_128(out++, OutReg4); + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack2(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), 
mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack6(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), 
mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + MM_STORE_SI_128(out++, 
OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack3(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 27), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack7(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 7) - 1); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), 
mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack4(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); -} + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack8(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - 
__m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastunpack5(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + MM_STORE_SI_128(out++, OutReg); } +void __SIMD_fastunpack6(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack9(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 9) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) 
, mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg 
= _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + 
MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack7(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack10(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 10) - 1); + 
OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) 
, mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack8(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, 
OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack11(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + 
MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 11) - 1); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack9(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const 
__m128i mask = _mm_set1_epi32((1U << 9) - 1); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); 
+ OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
9 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack12(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastunpack10(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 
8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack13(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 13) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 
2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack11(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 
2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack14(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 21); + MM_STORE_SI_128(out++, OutReg); +} - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 14) - 1); +void __SIMD_fastunpack12(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 18) ; - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack13(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); -} + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack15(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack14(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); 
+ OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack16(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg 
= _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + MM_STORE_SI_128(out++, 
OutReg); +} - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack15(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), 
mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack17(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack16(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); 
-void __SIMD_fastunpack18(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + 
InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack17(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), 
mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack19(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack18(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack20(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = 
_mm_set1_epi32((1U << 20) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack19(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = 
_mm_set1_epi32((1U << 19) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + MM_STORE_SI_128(out++, 
OutReg); -void __SIMD_fastunpack21(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + MM_STORE_SI_128(out++, OutReg); +} + +void __SIMD_fastunpack20(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg, 11) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + MM_STORE_SI_128(out++, OutReg); } +void __SIMD_fastunpack21(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack22(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, 
OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = 
+ _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 11); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack22(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack23(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); 
- MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = 
MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg 
= + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastunpack23(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - 
MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack24(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + MM_STORE_SI_128(out++, 
OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , 
mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), 
mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack24(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 
- 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack25(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = 
MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, 
OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack25(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); - MM_STORE_SI_128(out++, 
OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 7) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack26(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 
- 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 
4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_and_si128(InReg , 
mask); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack26(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack27(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - 
__m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = 
MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack27(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + 
OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack28(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - 
__m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 5); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack28(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + 
__m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), 
mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack29(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack29(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = 
MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 5); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = 
_mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 3) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); -void __SIMD_fastunpack30(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - 
const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 3); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); +void __SIMD_fastunpack30(const __m128i *__restrict__ in, + uint32_t 
*__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 2) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); 
- OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + 
MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_srli_epi32(InReg, 2) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + 
MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); -void __SIMD_fastunpack31(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_and_si128(InReg , mask); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + MM_STORE_SI_128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); - MM_STORE_SI_128(out++, OutReg); +void __SIMD_fastunpack31(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = MM_LOAD_SI_128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = 
_mm_srli_epi32(InReg, 31); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = MM_LOAD_SI_128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); - 
MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + MM_STORE_SI_128(out++, OutReg); - 
OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 3) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 2) ; - InReg = MM_LOAD_SI_128(++in); + OutReg = _mm_srli_epi32(InReg, 
5); + InReg = MM_LOAD_SI_128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); - MM_STORE_SI_128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + MM_STORE_SI_128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 1) ; - MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + MM_STORE_SI_128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 3); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + MM_STORE_SI_128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = MM_LOAD_SI_128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + MM_STORE_SI_128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg, 1); + MM_STORE_SI_128(out++, OutReg); +} -void __SIMD_fastunpack32(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { - __m128i *out = reinterpret_cast<__m128i *>(_out); - for (uint32_t outer = 0; outer < 32 ; ++outer) { - MM_STORE_SI_128(out++, MM_LOAD_SI_128(in++)); - } +void __SIMD_fastunpack32(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + for (uint32_t outer = 0; outer < 32; ++outer) { + MM_STORE_SI_128(out++, MM_LOAD_SI_128(in++)); + } } } // namespace SIMDCompressionLib diff --git a/src/simdintegratedbitpacking.cpp b/src/simdintegratedbitpacking.cpp index 6b44587..a44c997 100644 --- a/src/simdintegratedbitpacking.cpp +++ b/src/simdintegratedbitpacking.cpp @@ -8,25280 +8,25747 @@ namespace SIMDCompressionLib { - template -__m128i iunpack0(__m128i initOffset, const __m128i *, uint32_t *_out) { - __m128i *out = reinterpret_cast<__m128i *>(_out); - static const __m128i zero = _mm_set1_epi32(0); - - for (unsigned i = 0; i < 8; ++i) { - 
initOffset = DeltaHelper::PrefixSum(zero, initOffset); - MM_STORE_SI_128(out++, initOffset); - initOffset = DeltaHelper::PrefixSum(zero, initOffset); - MM_STORE_SI_128(out++, initOffset); - initOffset = DeltaHelper::PrefixSum(zero, initOffset); - MM_STORE_SI_128(out++, initOffset); - initOffset = DeltaHelper::PrefixSum(zero, initOffset); - MM_STORE_SI_128(out++, initOffset); - } - - return initOffset; +__m128i iunpack0(__m128i initOffset, const __m128i *, uint32_t *_out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + static const __m128i zero = _mm_set1_epi32(0); + + for (unsigned i = 0; i < 8; ++i) { + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + MM_STORE_SI_128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + MM_STORE_SI_128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + MM_STORE_SI_128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + MM_STORE_SI_128(out++, initOffset); + } + + return initOffset; } - - template -void ipackwithoutmask0(__m128i , const uint32_t *, __m128i *) { - -} +void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *) {} template -void ipack0(__m128i , const uint32_t *, __m128i *) { -} - +void ipack0(__m128i, const uint32_t *, __m128i *) {} template -void ipackwithoutmask1(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack1(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - +void ipack1(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(1U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + 
CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask2(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack2(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(3U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - +void ipack2(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(3U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); 
+ MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask3(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask3(__m128i initOffset, const uint32_t *_in, __m128i 
*out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack3(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(7U); ; - - 
__m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn 
= MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - +void ipack3(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(7U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn 
= MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask4(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - 
++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); 
+ InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack4(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(15U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; 
- ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - +void ipack4(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(15U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; 
+ ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask5(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - 
InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg 
= DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 
1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack5(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(31U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - +void ipack5(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(31U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask6(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset 
= CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack6(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(63U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - 
MM_STORE_SI_128(out, OutReg); - - +void ipack6(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(63U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask7(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg 
= DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack7(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(127U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - +void ipack7(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(127U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask8(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack8(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(255U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - +void ipack8(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(255U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask9(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - 
- ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - +void 
ipackwithoutmask9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack9(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(511U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - +void ipack9(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(511U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask10(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); 
- initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask10(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack10(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1023U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn 
= MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - +void ipack10(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(1023U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask11(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg 
= DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask11(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack11(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2047U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - +void ipack11(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(2047U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask12(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - +void 
ipackwithoutmask12(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack12(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(4095U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - +void ipack12(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(4095U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask13(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask13(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); 
+ ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + 
CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack13(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8191U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - +void ipack13(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(8191U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg 
= InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = MM_LOAD_SI_128(in); 
+ InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask14(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg 
= DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg 
= _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask14(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = 
reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack14(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16383U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); 
- - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - +void ipack14(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(16383U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); 
+ InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask15(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask15(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack15(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(32767U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - +void ipack15(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(32767U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask16(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask16(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack16(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(65535U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - 
++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - +void ipack16(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(65535U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask17(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask17(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack17(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(131071U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 
17 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - +void ipack17(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(131071U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask18(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; 
- OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack18(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(262143U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 
4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - +void ipack18(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(262143U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask19(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; 
+ CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack19(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(524287U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - +void ipack19(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(524287U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask20(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); 
- - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack20(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = 
_mm_set1_epi32(1048575U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - +void ipack20(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(1048575U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = 
InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; 
+ CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask21(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); 
+ ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack21(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2097151U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); 
- - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - +void ipack21(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(2097151U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask22(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask22(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack22(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(4194303U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); 
- - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - +void ipack22(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(4194303U); + ; + + __m128i 
CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); } - - template -void 
ipackwithoutmask23(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack23(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8388607U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - +void ipack23(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(8388607U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask24(__m128i initOffset, const 
uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask24(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg 
= DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack24(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16777215U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - +void 
ipack24(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(16777215U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + 
++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask25(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = 
MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask25(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack25(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(33554431U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - +void ipack25(__m128i initOffset, const uint32_t *_in, 
__m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(33554431U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask26(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack26(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(67108863U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - +void ipack26(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(67108863U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + 
CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask27(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg 
= DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack27(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(134217727U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = 
MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - +void ipack27(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(134217727U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = 
MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask28(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack28(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(268435455U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - +void ipack28(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(268435455U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn 
= MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask29(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack29(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(536870911U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - +void ipack29(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(536870911U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask30(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); 
- ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - 
CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack30(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1073741823U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - ++out; 
- ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - +void ipack30(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(1073741823U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + + ++out; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask31(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = 
DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = DeltaHelper::Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - MM_STORE_SI_128(out, OutReg); - - +void ipackwithoutmask31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + 
CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipack31(__m128i initOffset, const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const 
__m128i mask = _mm_set1_epi32(2147483647U); ; - - __m128i CurrIn = MM_LOAD_SI_128(in); - __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - 
- OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - 
MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - MM_STORE_SI_128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = MM_LOAD_SI_128(in); - InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - MM_STORE_SI_128(out, OutReg); - - +void ipack31(__m128i initOffset, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32(2147483647U); + ; + + __m128i CurrIn = MM_LOAD_SI_128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + 
MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + MM_STORE_SI_128(out, OutReg); + 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + 
++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + MM_STORE_SI_128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = MM_LOAD_SI_128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + MM_STORE_SI_128(out, OutReg); } - - template -void ipackwithoutmask32(__m128i /* initOffset */ , const uint32_t *_in, __m128i *out) 
{ - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - __m128i InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); - - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); - - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); +void ipackwithoutmask32(__m128i /* initOffset */, const uint32_t *_in, + 
__m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + __m128i InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = 
MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -} + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -template -void ipack32(__m128i /* initOffset */ , const uint32_t *_in, __m128i *out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - __m128i InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = 
MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = 
MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); +} - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); +template +void ipack32(__m128i /* initOffset */, const uint32_t *_in, __m128i *out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + __m128i InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + 
MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); - ++out; - ++in; - InReg = MM_LOAD_SI_128(in); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); - OutReg = InReg; - MM_STORE_SI_128(out, OutReg); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -} + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -template -__m128i iunpack1(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = 
MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 1) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - 
- tmp = _mm_srli_epi32(InReg, 21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - - return initOffset; + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -} + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -template -__m128i iunpack2(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - 
OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -} + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -template -__m128i iunpack3(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -} + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -template -__m128i iunpack4(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); 
- initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg 
= tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -} + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); -template -__m128i iunpack5(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i 
*>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); -} + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); + ++out; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = InReg; + MM_STORE_SI_128(out, OutReg); +} template -__m128i iunpack6(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 6) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - 
++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); 
- OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack1(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 1) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack7(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack2(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack8(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack3(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack9(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 
14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack4(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack10(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack5(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); 
+ OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack11(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, 
mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack6(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack12(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset 
= OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack7(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); 
+ initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; 
+ InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + 
+ OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack13(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out 
= reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack8(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); 
+ OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + 
+ tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack14(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); 
- OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); 
- - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack9(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); 
+ + return initOffset; } - - - template -__m128i iunpack15(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg, 16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack10(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); + + tmp = InReg; + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 
- 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack16(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 16) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack11(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + 
+ tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 21); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack17(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 17) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - 
++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack12(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack18(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 18) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack13(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); 
+ OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = 
MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
13 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack19(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 19) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack14(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack20(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 20) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack15(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + 
__m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); 
+ OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); 
+ initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack21(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 21) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack16(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = 
reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, 
OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack22(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack17(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 
14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack23(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); - - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack18(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp 
= _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack24(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack19(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = 
MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = 
tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack25(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); - - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack20(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 
8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack26(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - 
OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - 
OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i 
iunpack21(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + 
OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), 
mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack27(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg 
= tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack22(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack28(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg 
= _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - 
- tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack23(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack29(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 
10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack24(__m128i 
initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + 
InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - - template -__m128i iunpack30(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i 
*>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; - +__m128i iunpack25(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + 
OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } - - +template +__m128i iunpack26(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 26) - 
1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + 
OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; +} template -__m128i iunpack31(__m128i initOffset, const __m128i *in, uint32_t *_out) { - - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = MM_LOAD_SI_128(in); - __m128i OutReg; - __m128i tmp; - const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 31); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 30); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 29); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 28); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); - - 
OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 27); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 26); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 25); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 24); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 23); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 22); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 21); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 20); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 19); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 18); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 17); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 16); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 15); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 14); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 13); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 12); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 11); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 10); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 9); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 8); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 7); - OutReg = tmp; - ++in; InReg = 
MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 6); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 5); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 4); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 3); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 2); - OutReg = tmp; - ++in; InReg = MM_LOAD_SI_128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); - - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg, 1); - OutReg = tmp; - OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); - initOffset = OutReg; - MM_STORE_SI_128(out++, OutReg); - - - return initOffset; +__m128i iunpack27(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = 
MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = 
MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + + 
OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; +} +template +__m128i iunpack28(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + 
+ tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), 
mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = 
tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } +template +__m128i iunpack29(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; +} +template +__m128i iunpack30(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 
24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
MM_STORE_SI_128(out++, OutReg); + + return initOffset; +} template -__m128i iunpack32(__m128i , const __m128i *in, uint32_t *_out) { - __m128i *mout = reinterpret_cast<__m128i *>(_out); - __m128i invec; - for (size_t k = 0; k < 128 / 4; ++k) { - invec = MM_LOAD_SI_128(in++); - MM_STORE_SI_128(mout++, invec); - } - return invec; +__m128i iunpack31(__m128i initOffset, const __m128i *in, uint32_t *_out) { + + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = MM_LOAD_SI_128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 31); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 30); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 29); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 28); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 27); + OutReg = tmp; + ++in; + InReg = 
MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 26); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 25); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 24); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 23); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 22); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 21); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg, 20); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 19); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 18); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 17); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 16); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 15); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 14); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 13); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 12); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 11); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 10); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 9); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 8); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 7); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 6); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 5); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 4); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 3); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 2); + OutReg = tmp; + ++in; + InReg = MM_LOAD_SI_128(in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg, 1); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + MM_STORE_SI_128(out++, OutReg); + + return initOffset; } +template +__m128i iunpack32(__m128i, const __m128i *in, uint32_t *_out) { + __m128i *mout = reinterpret_cast<__m128i *>(_out); + __m128i invec; + for (size_t k = 0; k < 128 / 4; ++k) { + invec = MM_LOAD_SI_128(in++); + MM_STORE_SI_128(mout++, invec); + 
} + return invec; +} -template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack0(__m128i, const __m128i *, + uint32_t *); template void ipack0(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask0(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack1(__m128i, const __m128i *, + uint32_t *); template void ipack1(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack2(__m128i, const __m128i *, + uint32_t *); template void ipack2(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask2(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack3(__m128i, const __m128i *, + uint32_t *); template void ipack3(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack4(__m128i, const __m128i *, + uint32_t *); template void ipack4(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask4(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack5(__m128i, const __m128i *, + uint32_t *); template void ipack5(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); 
+template void ipackwithoutmask5(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack6(__m128i, const __m128i *, + uint32_t *); template void ipack6(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask6(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack7(__m128i, const __m128i *, + uint32_t *); template void ipack7(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack8(__m128i, const __m128i *, + uint32_t *); template void ipack8(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask8(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack9(__m128i, const __m128i *, + uint32_t *); template void ipack9(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack10(__m128i, const __m128i *, + uint32_t *); template void ipack10(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask10(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack11(__m128i, const __m128i *, + uint32_t *); template void ipack11(__m128i, const uint32_t *, __m128i *); -template 
void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack12(__m128i, const __m128i *, + uint32_t *); template void ipack12(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask12(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack13(__m128i, const __m128i *, + uint32_t *); template void ipack13(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack14(__m128i, const __m128i *, + uint32_t *); template void ipack14(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask14(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack15(__m128i, const __m128i *, + uint32_t *); template void ipack15(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack16(__m128i, const __m128i *, + uint32_t *); template void ipack16(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask16(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack17(__m128i, const __m128i *, + 
uint32_t *); template void ipack17(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack18(__m128i, const __m128i *, + uint32_t *); template void ipack18(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask18(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack19(__m128i, const __m128i *, + uint32_t *); template void ipack19(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack20(__m128i, const __m128i *, + uint32_t *); template void ipack20(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask20(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack21(__m128i, const __m128i *, + uint32_t *); template void ipack21(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack22(__m128i, const __m128i *, + uint32_t *); template void ipack22(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask22(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack23(__m128i, 
const __m128i *, uint32_t *); +template __m128i iunpack23(__m128i, const __m128i *, + uint32_t *); template void ipack23(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack24(__m128i, const __m128i *, + uint32_t *); template void ipack24(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask24(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack25(__m128i, const __m128i *, + uint32_t *); template void ipack25(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack26(__m128i, const __m128i *, + uint32_t *); template void ipack26(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask26(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack27(__m128i, const __m128i *, + uint32_t *); template void ipack27(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack28(__m128i, const __m128i *, + uint32_t *); template void ipack28(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); - +template void 
ipackwithoutmask28(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack29(__m128i, const __m128i *, + uint32_t *); template void ipack29(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack30(__m128i, const __m128i *, + uint32_t *); template void ipack30(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask30(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack31(__m128i, const __m128i *, + uint32_t *); template void ipack31(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack32(__m128i, const __m128i *, + uint32_t *); template void ipack32(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask32(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack0(__m128i, const __m128i *, + uint32_t *); template void ipack0(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack1(__m128i, const __m128i *, + uint32_t *); template void ipack1(__m128i, const uint32_t *, __m128i *); -template void 
ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask1(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack2(__m128i, const __m128i *, + uint32_t *); template void ipack2(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack3(__m128i, const __m128i *, + uint32_t *); template void ipack3(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask3(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack4(__m128i, const __m128i *, + uint32_t *); template void ipack4(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack5(__m128i, const __m128i *, + uint32_t *); template void ipack5(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask5(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack6(__m128i, const __m128i *, + uint32_t *); template void ipack6(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack7(__m128i, const __m128i *, + uint32_t *); template void 
ipack7(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask7(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack8(__m128i, const __m128i *, + uint32_t *); template void ipack8(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack9(__m128i, const __m128i *, + uint32_t *); template void ipack9(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask9(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack10(__m128i, const __m128i *, + uint32_t *); template void ipack10(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask10(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack11(__m128i, const __m128i *, + uint32_t *); template void ipack11(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask11(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack12(__m128i, const __m128i *, + uint32_t *); template void ipack12(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template 
__m128i iunpack13(__m128i, const __m128i *, + uint32_t *); template void ipack13(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask13(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack14(__m128i, const __m128i *, + uint32_t *); template void ipack14(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack15(__m128i, const __m128i *, + uint32_t *); template void ipack15(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask15(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack16(__m128i, const __m128i *, + uint32_t *); template void ipack16(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack17(__m128i, const __m128i *, + uint32_t *); template void ipack17(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask17(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack18(__m128i, const __m128i *, + uint32_t *); template void ipack18(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, + __m128i 
*); - -template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack19(__m128i, const __m128i *, + uint32_t *); template void ipack19(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask19(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack20(__m128i, const __m128i *, + uint32_t *); template void ipack20(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask20(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack21(__m128i, const __m128i *, + uint32_t *); template void ipack21(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask21(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack22(__m128i, const __m128i *, + uint32_t *); template void ipack22(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack23(__m128i, const __m128i *, + uint32_t *); template void ipack23(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask23(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack24(__m128i, const __m128i *, + uint32_t *); template void ipack24(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask24(__m128i, const uint32_t *, 
__m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack25(__m128i, const __m128i *, + uint32_t *); template void ipack25(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask25(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack26(__m128i, const __m128i *, + uint32_t *); template void ipack26(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask26(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack27(__m128i, const __m128i *, + uint32_t *); template void ipack27(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask27(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack28(__m128i, const __m128i *, + uint32_t *); template void ipack28(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask28(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack29(__m128i, const __m128i *, + uint32_t *); template void ipack29(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask29(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack30(__m128i, const __m128i *, + uint32_t *); template void ipack30(__m128i, const 
uint32_t *, __m128i *); -template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack31(__m128i, const __m128i *, + uint32_t *); template void ipack31(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask31(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack32(__m128i, const __m128i *, + uint32_t *); template void ipack32(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack0(__m128i, const __m128i *, + uint32_t *); template void ipack0(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask0(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack1(__m128i, const __m128i *, + uint32_t *); template void ipack1(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack2(__m128i, const __m128i *, + uint32_t *); template void ipack2(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask2(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack3(__m128i, 
const __m128i *, + uint32_t *); template void ipack3(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack4(__m128i, const __m128i *, + uint32_t *); template void ipack4(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask4(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack5(__m128i, const __m128i *, + uint32_t *); template void ipack5(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask5(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack6(__m128i, const __m128i *, + uint32_t *); template void ipack6(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask6(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack7(__m128i, const __m128i *, + uint32_t *); template void ipack7(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack8(__m128i, const __m128i *, + uint32_t *); template void ipack8(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask8(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack9(__m128i, const __m128i 
*, uint32_t *); +template __m128i iunpack9(__m128i, const __m128i *, + uint32_t *); template void ipack9(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack10(__m128i, const __m128i *, + uint32_t *); template void ipack10(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask10(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack11(__m128i, const __m128i *, + uint32_t *); template void ipack11(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack12(__m128i, const __m128i *, + uint32_t *); template void ipack12(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask12(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack13(__m128i, const __m128i *, + uint32_t *); template void ipack13(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack14(__m128i, const __m128i *, + uint32_t *); template void ipack14(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask14(__m128i, 
const uint32_t *, + __m128i *); -template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack15(__m128i, const __m128i *, + uint32_t *); template void ipack15(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack16(__m128i, const __m128i *, + uint32_t *); template void ipack16(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask16(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack17(__m128i, const __m128i *, + uint32_t *); template void ipack17(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack18(__m128i, const __m128i *, + uint32_t *); template void ipack18(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask18(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack19(__m128i, const __m128i *, + uint32_t *); template void ipack19(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack20(__m128i, const __m128i *, + uint32_t *); template void ipack20(__m128i, const uint32_t *, __m128i *); -template void 
ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask20(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack21(__m128i, const __m128i *, + uint32_t *); template void ipack21(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack22(__m128i, const __m128i *, + uint32_t *); template void ipack22(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask22(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack23(__m128i, const __m128i *, + uint32_t *); template void ipack23(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack24(__m128i, const __m128i *, + uint32_t *); template void ipack24(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask24(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack25(__m128i, const __m128i *, + uint32_t *); template void ipack25(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack26(__m128i, const __m128i *, + 
uint32_t *); template void ipack26(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask26(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack27(__m128i, const __m128i *, + uint32_t *); template void ipack27(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack28(__m128i, const __m128i *, + uint32_t *); template void ipack28(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask28(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack29(__m128i, const __m128i *, + uint32_t *); template void ipack29(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack30(__m128i, const __m128i *, + uint32_t *); template void ipack30(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask30(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack31(__m128i, const __m128i *, + uint32_t *); template void ipack31(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack32(__m128i, 
const __m128i *, uint32_t *); +template __m128i iunpack32(__m128i, const __m128i *, + uint32_t *); template void ipack32(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask32(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); template void ipack0(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); template void ipack1(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask1(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); template void ipack2(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); template void ipack3(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask3(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); template void ipack4(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, + 
__m128i *); - -template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); template void ipack5(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask5(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); template void ipack6(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); template void ipack7(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask7(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); template void ipack8(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); template void ipack9(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask9(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); template void ipack10(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); +template void 
ipackwithoutmask10(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); template void ipack11(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask11(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); template void ipack12(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); template void ipack13(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask13(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); template void ipack14(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); template void ipack15(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask15(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); template void ipack16(__m128i, const uint32_t *, __m128i *); -template void 
ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); template void ipack17(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask17(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); template void ipack18(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); template void ipack19(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask19(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); template void ipack20(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask20(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); template void ipack21(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask21(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); 
template void ipack22(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); template void ipack23(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask23(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); template void ipack24(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); template void ipack25(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask25(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); template void ipack26(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask26(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); template void ipack27(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask27(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack28(__m128i, const __m128i *, uint32_t 
*); +template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); template void ipack28(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask28(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); template void ipack29(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask29(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); template void ipack30(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, + __m128i *); - -template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); template void ipack31(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); - +template void ipackwithoutmask31(__m128i, const uint32_t *, + __m128i *); -template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); template void ipack32(__m128i, const uint32_t *, __m128i *); -template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, + __m128i *); } // namespace SIMDCompressionLib diff --git a/src/simdpackedsearch.c b/src/simdpackedsearch.c index 3051e81..8cb73df 100644 --- a/src/simdpackedsearch.c +++ b/src/simdpackedsearch.c @@ -6,11 +6,11 @@ #include "platform.h" #ifdef USE_ALIGNED -# define MM_LOAD_SI_128 _mm_load_si128 -# define MM_STORE_SI_128 _mm_store_si128 +#define 
MM_LOAD_SI_128 _mm_load_si128 +#define MM_STORE_SI_128 _mm_store_si128 #else -# define MM_LOAD_SI_128 _mm_loadu_si128 -# define MM_STORE_SI_128 _mm_storeu_si128 +#define MM_LOAD_SI_128 _mm_loadu_si128 +#define MM_STORE_SI_128 _mm_storeu_si128 #endif SIMDCOMP_ALIGNED(16) static int8_t shuffle_mask_bytes[256] = { @@ -31,7257 +31,7571 @@ SIMDCOMP_ALIGNED(16) static int8_t shuffle_mask_bytes[256] = { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; -static const __m128i *shuffle_mask = (__m128i *) shuffle_mask_bytes; +static const __m128i *shuffle_mask = (__m128i *)shuffle_mask_bytes; /* should emulate std:lower_bound*/ -static int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) -{ - int imid; - imax --; - while(imin + 1 < imax) { - imid = imin + ((imax - imin) / 2); - - if (A[imid] >= key) { - imax = imid; - } else if (A[imid] < key) { - imin = imid; - } +static int lower_bound(uint32_t *A, uint32_t key, int imin, int imax) { + int imid; + imax--; + while (imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; } - if(A[imin] >= key) return imin; - return imax; + } + if (A[imin] >= key) + return imin; + return imax; } - -#define PrefixSum(ret, curr, prev) do { \ - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ - const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ - ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ - } while (0) - - +#define PrefixSum(ret, curr, prev) \ + do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) /* perform a lower-bound search for |key| in |out|; the resulting uint32 * is stored in |*presult|.*/ -#define CHECK_AND_INCREMENT(i, out, key, presult) \ - do 
{ \ - __m128i tmpout = _mm_sub_epi32(out, conversion); \ - uint32_t mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ - if (mask != 15) { \ - __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mask ^ 15]); \ - int offset; \ - offset = __builtin_ctz(mask ^ 15); \ - *presult = _mm_cvtsi128_si32(p); \ - return (i + offset); \ - } \ - i += 4; \ - } while (0) - -static int -iunpacksearch0(__m128i * initOffset , const __m128i * _in, - uint32_t key, uint32_t *presult) -{ - uint32_t repeatedvalue = (uint32_t) _mm_extract_epi32(*initOffset, 3); - if (repeatedvalue >= key) { - *presult = repeatedvalue; - return 0; - } - *presult = key + 1; - (void)_in; - return (128); +#define CHECK_AND_INCREMENT(i, out, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mask = \ + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mask != 15) { \ + __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mask ^ 15]); \ + int offset; \ + offset = __builtin_ctz(mask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + return (i + offset); \ + } \ + i += 4; \ + } while (0) + +static int iunpacksearch0(__m128i *initOffset, const __m128i *_in, uint32_t key, + uint32_t *presult) { + uint32_t repeatedvalue = (uint32_t)_mm_extract_epi32(*initOffset, 3); + if (repeatedvalue >= key) { + *presult = repeatedvalue; + return 0; + } + *presult = key + 1; + (void)_in; + return (128); } -static int -iunpacksearch1(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch1(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<1)-1); + __m128i mask = _mm_set1_epi32((1U << 1) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); 
+ CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - 
tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = 
_mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch2(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch2(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = 
_mm_set1_epi32((1U<<2)-1); + __m128i mask = _mm_set1_epi32((1U << 2) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, 
out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch3(__m128i * 
initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch3(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<3)-1); + __m128i mask = _mm_set1_epi32((1U << 3) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); 
- tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch4(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch4(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<4)-1); + __m128i mask = _mm_set1_epi32((1U << 4) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = 
_mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch5(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch5(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<5)-1); + __m128i mask = _mm_set1_epi32((1U << 5) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, 
presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, 
out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } 
-static int -iunpacksearch6(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch6(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<6)-1); + __m128i mask = _mm_set1_epi32((1U << 6) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, 
presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp 
= _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch7(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch7(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<7)-1); + __m128i mask = _mm_set1_epi32((1U << 7) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch8(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch8(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<8)-1); + __m128i mask = _mm_set1_epi32((1U << 8) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, 
out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, 
key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch9(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch9(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<9)-1); + __m128i mask = _mm_set1_epi32((1U << 9) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - 
tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, 
presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch10(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch10(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<10)-1); + __m128i mask = _mm_set1_epi32((1U << 10) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = 
_mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, 
presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch11(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch11(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<11)-1); + __m128i mask = _mm_set1_epi32((1U << 11) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); PrefixSum(out, 
out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, 
presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch12(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch12(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<12)-1); + __m128i mask = _mm_set1_epi32((1U << 12) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); 
+ CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out 
= _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, 
presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch13(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch13(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<13)-1); + __m128i mask = _mm_set1_epi32((1U << 13) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp 
= _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
- CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch14(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) 
-{ +static int iunpacksearch14(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<14)-1); + __m128i mask = _mm_set1_epi32((1U << 14) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); 
+ tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch15(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch15(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<15)-1); + __m128i mask = _mm_set1_epi32((1U << 15) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = 
_mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, 
presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch16(__m128i * 
initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch16(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i mask = _mm_set1_epi32((1U << 16) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = 
out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = 
_mm_srli_epi32(InReg, 16); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch17(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch17(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<17)-1); + __m128i mask = _mm_set1_epi32((1U << 17) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + ++in; + InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch18(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int 
iunpacksearch18(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<18)-1); + __m128i mask = _mm_set1_epi32((1U << 18) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = 
_mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, 
out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch19(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch19(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<19)-1); + __m128i mask = _mm_set1_epi32((1U << 19) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = 
_mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = 
_mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch20(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch20(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<20)-1); + __m128i mask = 
_mm_set1_epi32((1U << 20) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; 
- ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch21(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch21(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<21)-1); + __m128i mask = _mm_set1_epi32((1U << 21) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset 
= out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = 
_mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch22(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch22(__m128i *initOffset, const __m128i 
*in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<22)-1); + __m128i mask = _mm_set1_epi32((1U << 22) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, 
key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 
- 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, 
presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch23(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch23(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<23)-1); + __m128i mask = _mm_set1_epi32((1U << 23) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch24(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch24(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<24)-1); + __m128i mask = _mm_set1_epi32((1U << 24) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + 
tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch25(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch25(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<25)-1); + __m128i mask = _mm_set1_epi32((1U << 25) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 
15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp 
= _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, 
key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch26(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch26(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<26)-1); + __m128i mask = _mm_set1_epi32((1U << 26) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, 
key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, 
key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch27(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch27(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = 
_mm_set1_epi32((1U<<27)-1); + __m128i mask = _mm_set1_epi32((1U << 27) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); 
+ ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 
1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch28(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch28(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<28)-1); + __m128i mask = _mm_set1_epi32((1U << 28) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - 
tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); 
- tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch29(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch29(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<29)-1); + __m128i mask = _mm_set1_epi32((1U << 29) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
29-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, 
out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch30(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch30(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i mask = _mm_set1_epi32((1U << 30) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); @@ -7289,671 +7603,755 @@ iunpacksearch30(__m128i * initOffset, const __m128i *in, out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
30-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = 
_mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = 
_mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch31(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ +static int iunpacksearch31(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<31)-1); + __m128i mask = _mm_set1_epi32((1U << 31) - 1); __m128i conversion = _mm_set1_epi32(2147483648U); __m128i key4 = _mm_set1_epi32(key - 2147483648U); - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - 
CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
31 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = 
_mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + CHECK_AND_INCREMENT(i, out, key, presult); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - CHECK_AND_INCREMENT(i, out, key, presult); + 
CHECK_AND_INCREMENT(i, out, key, presult); *presult = key + 1; return (128); } -static int -iunpacksearch32(__m128i * initOffset, const __m128i *in, - uint32_t key, uint32_t *presult) -{ - uint32_t * in32 = (uint32_t *)in; - int answer = lower_bound(in32, key, 0, 128); - if(in32[answer] < key) { - *presult = key + 1; - return (128); - } - *presult = in32[answer]; - *initOffset = MM_LOAD_SI_128(in + 31); - return answer; - +static int iunpacksearch32(__m128i *initOffset, const __m128i *in, uint32_t key, + uint32_t *presult) { + uint32_t *in32 = (uint32_t *)in; + int answer = lower_bound(in32, key, 0, 128); + if (in32[answer] < key) { + *presult = key + 1; + return (128); + } + *presult = in32[answer]; + *initOffset = MM_LOAD_SI_128(in + 31); + return answer; } +int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult) { + switch (bit) { + case 0: + return iunpacksearch0(initOffset, in, key, presult); -int -simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit, - uint32_t key, uint32_t *presult) -{ - switch (bit) { - case 0: return iunpacksearch0(initOffset, in, key, presult); - - case 1: return iunpacksearch1(initOffset, in, key, presult); + case 1: + return iunpacksearch1(initOffset, in, key, presult); - case 2: return iunpacksearch2(initOffset, in, key, presult); + case 2: + return iunpacksearch2(initOffset, in, key, presult); - case 3: return iunpacksearch3(initOffset, in, key, presult); + case 3: + return iunpacksearch3(initOffset, in, key, presult); - case 4: return iunpacksearch4(initOffset, in, key, presult); + case 4: + return iunpacksearch4(initOffset, in, key, presult); - case 5: return iunpacksearch5(initOffset, in, key, presult); + case 5: + return iunpacksearch5(initOffset, in, key, presult); - case 6: return iunpacksearch6(initOffset, in, key, presult); + case 6: + return iunpacksearch6(initOffset, in, key, presult); - case 7: return iunpacksearch7(initOffset, in, key, presult); + case 7: + 
return iunpacksearch7(initOffset, in, key, presult); - case 8: return iunpacksearch8(initOffset, in, key, presult); + case 8: + return iunpacksearch8(initOffset, in, key, presult); - case 9: return iunpacksearch9(initOffset, in, key, presult); + case 9: + return iunpacksearch9(initOffset, in, key, presult); - case 10: return iunpacksearch10(initOffset, in, key, presult); + case 10: + return iunpacksearch10(initOffset, in, key, presult); - case 11: return iunpacksearch11(initOffset, in, key, presult); + case 11: + return iunpacksearch11(initOffset, in, key, presult); - case 12: return iunpacksearch12(initOffset, in, key, presult); + case 12: + return iunpacksearch12(initOffset, in, key, presult); - case 13: return iunpacksearch13(initOffset, in, key, presult); + case 13: + return iunpacksearch13(initOffset, in, key, presult); - case 14: return iunpacksearch14(initOffset, in, key, presult); + case 14: + return iunpacksearch14(initOffset, in, key, presult); - case 15: return iunpacksearch15(initOffset, in, key, presult); + case 15: + return iunpacksearch15(initOffset, in, key, presult); - case 16: return iunpacksearch16(initOffset, in, key, presult); + case 16: + return iunpacksearch16(initOffset, in, key, presult); - case 17: return iunpacksearch17(initOffset, in, key, presult); + case 17: + return iunpacksearch17(initOffset, in, key, presult); - case 18: return iunpacksearch18(initOffset, in, key, presult); + case 18: + return iunpacksearch18(initOffset, in, key, presult); - case 19: return iunpacksearch19(initOffset, in, key, presult); + case 19: + return iunpacksearch19(initOffset, in, key, presult); - case 20: return iunpacksearch20(initOffset, in, key, presult); + case 20: + return iunpacksearch20(initOffset, in, key, presult); - case 21: return iunpacksearch21(initOffset, in, key, presult); + case 21: + return iunpacksearch21(initOffset, in, key, presult); - case 22: return iunpacksearch22(initOffset, in, key, presult); + case 22: + return 
iunpacksearch22(initOffset, in, key, presult); - case 23: return iunpacksearch23(initOffset, in, key, presult); + case 23: + return iunpacksearch23(initOffset, in, key, presult); - case 24: return iunpacksearch24(initOffset, in, key, presult); + case 24: + return iunpacksearch24(initOffset, in, key, presult); - case 25: return iunpacksearch25(initOffset, in, key, presult); + case 25: + return iunpacksearch25(initOffset, in, key, presult); - case 26: return iunpacksearch26(initOffset, in, key, presult); + case 26: + return iunpacksearch26(initOffset, in, key, presult); - case 27: return iunpacksearch27(initOffset, in, key, presult); + case 27: + return iunpacksearch27(initOffset, in, key, presult); - case 28: return iunpacksearch28(initOffset, in, key, presult); + case 28: + return iunpacksearch28(initOffset, in, key, presult); - case 29: return iunpacksearch29(initOffset, in, key, presult); + case 29: + return iunpacksearch29(initOffset, in, key, presult); - case 30: return iunpacksearch30(initOffset, in, key, presult); + case 30: + return iunpacksearch30(initOffset, in, key, presult); - case 31: return iunpacksearch31(initOffset, in, key, presult); + case 31: + return iunpacksearch31(initOffset, in, key, presult); - case 32: return iunpacksearch32(initOffset, in, key, presult); + case 32: + return iunpacksearch32(initOffset, in, key, presult); - default: break; - } - return (-1); + default: + break; + } + return (-1); } diff --git a/src/simdpackedselect.c b/src/simdpackedselect.c index 6361114..207e068 100644 --- a/src/simdpackedselect.c +++ b/src/simdpackedselect.c @@ -6,47 +6,46 @@ #include "platform.h" #ifdef USE_ALIGNED -# define MM_LOAD_SI_128 _mm_load_si128 -# define MM_STORE_SI_128 _mm_store_si128 +#define MM_LOAD_SI_128 _mm_load_si128 +#define MM_STORE_SI_128 _mm_store_si128 #else -# define MM_LOAD_SI_128 _mm_loadu_si128 -# define MM_STORE_SI_128 _mm_storeu_si128 +#define MM_LOAD_SI_128 _mm_loadu_si128 +#define MM_STORE_SI_128 _mm_storeu_si128 #endif 
-SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes[256] = { - 0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0, - 4,5,6,7,0,0,0,0,0,0,0,0,0,0,0,0, - 8,9,10,11,0,0,0,0,0,0,0,0,0,0,0,0, - 12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0, - }; +SIMDCOMP_ALIGNED(16) +int8_t shuffle_mask_bytes[256] = { + 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; -static const __m128i *shuffle_mask = (__m128i *) shuffle_mask_bytes; +static const __m128i *shuffle_mask = (__m128i *)shuffle_mask_bytes; -uint32_t branchlessextract (__m128i out, int i) { - return _mm_cvtsi128_si32(_mm_shuffle_epi8(out,shuffle_mask[i])); +uint32_t branchlessextract(__m128i out, int i) { + return _mm_cvtsi128_si32(_mm_shuffle_epi8(out, shuffle_mask[i])); } -#define PrefixSum(ret, curr, prev) do { \ - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ - const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ - ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ - } while (0) - -#define CHECK_AND_INCREMENT(i, out, slot) \ - i += 4; \ - if (i > slot) { \ - return branchlessextract (out, slot - (i - 4)); \ - } - - -static uint32_t -iunpackselect1(__m128i * initOffset, const __m128i *in, int slot) -{ +#define PrefixSum(ret, curr, prev) \ + do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + +#define CHECK_AND_INCREMENT(i, out, slot) \ + i += 4; \ + if (i > slot) { \ + return branchlessextract(out, slot - (i - 4)); \ + } + +static uint32_t iunpackselect1(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<1)-1); + __m128i mask = _mm_set1_epi32((1U << 1) - 
1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -54,204 +53,202 @@ iunpackselect1(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = 
_mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect2(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect2(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<2)-1); + __m128i mask = _mm_set1_epi32((1U << 2) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -259,93 +256,94 @@ iunpackselect2(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -356,108 +354,106 @@ iunpackselect2(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect3(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect3(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<3)-1); + __m128i mask = _mm_set1_epi32((1U << 3) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -465,210 +461,210 @@ iunpackselect3(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = 
_mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect4(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<4)-1); + __m128i mask = _mm_set1_epi32((1U << 4) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -676,45 +672,46 @@ iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -725,45 +722,46 @@ iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, 
out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -774,45 +772,46 @@ iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -823,60 +822,58 @@ iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect5(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect5(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<5)-1); + __m128i mask = 
_mm_set1_epi32((1U << 5) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -884,216 +881,218 @@ iunpackselect5(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 
13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = 
_mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect6(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect6(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<6)-1); + __m128i mask = _mm_set1_epi32((1U << 6) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -1101,99 +1100,102 @@ iunpackselect6(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1204,114 +1206,114 @@ iunpackselect6(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = 
_mm_srli_epi32(InReg, 26); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect7(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect7(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<7)-1); + __m128i mask = _mm_set1_epi32((1U << 7) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -1319,222 +1321,226 @@ iunpackselect7(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, 
out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = 
_mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, 
out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect8(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = 
_mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<8)-1); + __m128i mask = _mm_set1_epi32((1U << 8) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -1542,21 +1548,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1567,21 +1574,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1592,21 +1600,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1617,21 +1626,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1642,21 +1652,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); @@ -1667,21 +1678,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1692,21 +1704,22 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -1717,36 +1730,34 @@ iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect9(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect9(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<9)-1); + __m128i mask = _mm_set1_epi32((1U << 9) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -1754,228 +1765,234 @@ iunpackselect9(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = 
_mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + 
tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect10(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect10(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<10)-1); + __m128i mask = _mm_set1_epi32((1U << 10) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -1983,105 +2000,110 @@ iunpackselect10(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -2092,120 +2114,122 @@ iunpackselect10(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect11(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect11(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<11)-1); + __m128i mask = _mm_set1_epi32((1U << 11) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -2213,234 +2237,242 @@ iunpackselect11(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = 
_mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - 
tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect12(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<12)-1); + __m128i mask = _mm_set1_epi32((1U << 12) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -2448,51 +2480,54 @@ iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -2503,51 +2538,54 @@ iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -2558,51 +2596,54 @@ iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -2613,66 +2654,66 @@ iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect13(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect13(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<13)-1); + __m128i mask = _mm_set1_epi32((1U << 13) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -2680,240 +2721,250 @@ iunpackselect13(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - 
return (0); } -static uint32_t -iunpackselect14(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect14(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<14)-1); + __m128i mask = _mm_set1_epi32((1U << 14) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -2921,111 +2972,118 @@ iunpackselect14(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3036,126 +3094,130 @@ iunpackselect14(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect15(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect15(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<15)-1); + __m128i mask = _mm_set1_epi32((1U << 15) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -3163,246 +3225,258 @@ iunpackselect15(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect16(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i mask = _mm_set1_epi32((1U << 16) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -3410,9 +3484,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3423,9 +3498,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3436,9 +3512,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3449,9 +3526,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp 
= _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3462,9 +3540,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3475,9 +3554,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3488,9 +3568,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3501,9 +3582,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3514,9 +3596,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 
16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3527,9 +3610,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3540,9 +3624,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3553,9 +3638,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3566,9 +3652,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3579,9 +3666,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + 
++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3592,9 +3680,10 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -3605,24 +3694,22 @@ iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect17(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect17(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<17)-1); + __m128i mask = _mm_set1_epi32((1U << 17) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -3630,252 +3717,266 @@ iunpackselect17(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + 
tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect18(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect18(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<18)-1); + __m128i mask = _mm_set1_epi32((1U << 18) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -3883,117 +3984,126 @@ iunpackselect18(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 
20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -4004,132 +4114,138 @@ iunpackselect18(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect19(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect19(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<19)-1); + __m128i mask = _mm_set1_epi32((1U << 19) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -4137,258 +4253,274 @@ iunpackselect19(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect20(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<20)-1); + __m128i mask = _mm_set1_epi32((1U << 20) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -4396,57 +4528,62 @@ iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -4457,57 +4594,62 @@ iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -4518,57 +4660,62 @@ iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp 
= _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = 
_mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -4579,72 +4726,74 @@ iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect21(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect21(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<21)-1); + __m128i mask = _mm_set1_epi32((1U << 21) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -4652,264 +4801,282 @@ iunpackselect21(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect22(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect22(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<22)-1); + __m128i mask = _mm_set1_epi32((1U << 22) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -4917,123 +5084,134 @@ iunpackselect22(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5044,138 +5222,146 @@ iunpackselect22(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect23(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t 
iunpackselect23(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<23)-1); + __m128i mask = _mm_set1_epi32((1U << 23) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -5183,270 +5369,290 @@ iunpackselect23(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
23-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - 
tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect24(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<24)-1); + __m128i mask = _mm_set1_epi32((1U << 24) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -5454,27 +5660,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5485,27 +5694,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5516,27 +5728,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5547,27 +5762,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5578,27 +5796,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5609,27 +5830,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5640,27 +5864,30 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -5671,42 +5898,42 @@ iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect25(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect25(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<25)-1); + __m128i mask = _mm_set1_epi32((1U << 25) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -5714,276 +5941,298 @@ iunpackselect25(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - 
tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect26(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect26(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<26)-1); + __m128i mask = _mm_set1_epi32((1U << 26) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -5991,129 +6240,142 @@ iunpackselect26(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 
10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -6124,144 +6386,154 @@ iunpackselect26(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 
4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect27(__m128i * initOffset, const __m128i *in, int slot) -{ 
+static uint32_t iunpackselect27(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<27)-1); + __m128i mask = _mm_set1_epi32((1U << 27) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -6269,282 +6541,306 @@ iunpackselect27(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect28(__m128i *initOffset, const __m128i 
*in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<28)-1); + __m128i mask = _mm_set1_epi32((1U << 28) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -6552,63 +6848,70 @@ iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -6619,63 +6922,70 @@ iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -6686,63 +6996,70 @@ iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); 
- tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -6753,78 +7070,82 @@ iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect29(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect29(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<29)-1); + __m128i mask = _mm_set1_epi32((1U << 29) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -6832,288 +7153,314 @@ iunpackselect29(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = 
_mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = 
_mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect30(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect30(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); 
__m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i mask = _mm_set1_epi32((1U << 30) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -7121,135 +7468,150 @@ iunpackselect30(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; 
- ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); 
out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); @@ -7260,150 +7622,162 @@ iunpackselect30(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 
12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); 
out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect31(__m128i * initOffset, const __m128i *in, int slot) -{ +static uint32_t iunpackselect31(__m128i *initOffset, const __m128i *in, + int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<31)-1); + __m128i mask = _mm_set1_epi32((1U << 31) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); @@ -7411,8082 +7785,7576 @@ iunpackselect31(__m128i * initOffset, const __m128i *in, int slot) *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - 
++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); 
out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 
8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); - return (0); } -static uint32_t -iunpackselect32(__m128i * initOffset , const __m128i *in, int slot) -{ +static uint32_t iunpackselect32(__m128i *initOffset, const __m128i *in, + int slot) { uint32_t *begin = (uint32_t *)in; *initOffset = MM_LOAD_SI_128(in + 31); return begin[slot]; } +uint32_t simdselectd1(__m128i *initOffset, const __m128i *in, uint32_t bit, + int slot) { + slot &= 127; /* to avoid problems */ + switch (bit) { + case 0: + return _mm_extract_epi32(*initOffset, 3); + break; -uint32_t -simdselectd1(__m128i * initOffset, const __m128i *in, uint32_t bit, int slot) -{ - slot &= 127; /* to avoid problems */ - switch (bit) { - case 0: return _mm_extract_epi32(*initOffset,3); break; + case 1: + return iunpackselect1(initOffset, in, slot); + break; - case 1: return iunpackselect1(initOffset, in, slot); break; + case 2: + return iunpackselect2(initOffset, in, slot); + break; - case 2: return iunpackselect2(initOffset, in, slot); break; + case 3: + return iunpackselect3(initOffset, in, slot); + break; - case 3: return iunpackselect3(initOffset, in, slot); break; + case 4: + return 
iunpackselect4(initOffset, in, slot); + break; - case 4: return iunpackselect4(initOffset, in, slot); break; + case 5: + return iunpackselect5(initOffset, in, slot); + break; - case 5: return iunpackselect5(initOffset, in, slot); break; + case 6: + return iunpackselect6(initOffset, in, slot); + break; - case 6: return iunpackselect6(initOffset, in, slot); break; + case 7: + return iunpackselect7(initOffset, in, slot); + break; - case 7: return iunpackselect7(initOffset, in, slot); break; + case 8: + return iunpackselect8(initOffset, in, slot); + break; - case 8: return iunpackselect8(initOffset, in, slot); break; + case 9: + return iunpackselect9(initOffset, in, slot); + break; - case 9: return iunpackselect9(initOffset, in, slot); break; + case 10: + return iunpackselect10(initOffset, in, slot); + break; - case 10: return iunpackselect10(initOffset, in, slot); break; + case 11: + return iunpackselect11(initOffset, in, slot); + break; - case 11: return iunpackselect11(initOffset, in, slot); break; + case 12: + return iunpackselect12(initOffset, in, slot); + break; - case 12: return iunpackselect12(initOffset, in, slot); break; + case 13: + return iunpackselect13(initOffset, in, slot); + break; - case 13: return iunpackselect13(initOffset, in, slot); break; + case 14: + return iunpackselect14(initOffset, in, slot); + break; - case 14: return iunpackselect14(initOffset, in, slot); break; + case 15: + return iunpackselect15(initOffset, in, slot); + break; - case 15: return iunpackselect15(initOffset, in, slot); break; + case 16: + return iunpackselect16(initOffset, in, slot); + break; - case 16: return iunpackselect16(initOffset, in, slot); break; + case 17: + return iunpackselect17(initOffset, in, slot); + break; - case 17: return iunpackselect17(initOffset, in, slot); break; + case 18: + return iunpackselect18(initOffset, in, slot); + break; - case 18: return iunpackselect18(initOffset, in, slot); break; + case 19: + return iunpackselect19(initOffset, in, slot); + 
break; - case 19: return iunpackselect19(initOffset, in, slot); break; + case 20: + return iunpackselect20(initOffset, in, slot); + break; - case 20: return iunpackselect20(initOffset, in, slot); break; + case 21: + return iunpackselect21(initOffset, in, slot); + break; - case 21: return iunpackselect21(initOffset, in, slot); break; + case 22: + return iunpackselect22(initOffset, in, slot); + break; - case 22: return iunpackselect22(initOffset, in, slot); break; + case 23: + return iunpackselect23(initOffset, in, slot); + break; - case 23: return iunpackselect23(initOffset, in, slot); break; + case 24: + return iunpackselect24(initOffset, in, slot); + break; - case 24: return iunpackselect24(initOffset, in, slot); break; + case 25: + return iunpackselect25(initOffset, in, slot); + break; - case 25: return iunpackselect25(initOffset, in, slot); break; + case 26: + return iunpackselect26(initOffset, in, slot); + break; - case 26: return iunpackselect26(initOffset, in, slot); break; + case 27: + return iunpackselect27(initOffset, in, slot); + break; - case 27: return iunpackselect27(initOffset, in, slot); break; + case 28: + return iunpackselect28(initOffset, in, slot); + break; - case 28: return iunpackselect28(initOffset, in, slot); break; + case 29: + return iunpackselect29(initOffset, in, slot); + break; - case 29: return iunpackselect29(initOffset, in, slot); break; + case 30: + return iunpackselect30(initOffset, in, slot); + break; - case 30: return iunpackselect30(initOffset, in, slot); break; + case 31: + return iunpackselect31(initOffset, in, slot); + break; - case 31: return iunpackselect31(initOffset, in, slot); break; + case 32: + return iunpackselect32(initOffset, in, slot); + break; - case 32: return iunpackselect32(initOffset, in, slot); break; + default: + break; + } - default: break; - } - - return (-1); + return (-1); } -static void -iunpackscan1(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan1(__m128i *initOffset, const __m128i 
*in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<1)-1); + __m128i mask = _mm_set1_epi32((1U << 1) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp 
= _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan2(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan2(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<2)-1); + __m128i mask = _mm_set1_epi32((1U << 2) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = 
_mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan3(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan3(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<3)-1); + __m128i mask = _mm_set1_epi32((1U << 3) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - 
- tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan4(__m128i * initOffset, const __m128i *in) -{ +static void 
iunpackscan4(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<4)-1); + __m128i mask = _mm_set1_epi32((1U << 4) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan5(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan5(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<5)-1); + __m128i mask = _mm_set1_epi32((1U << 5) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + 
tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan6(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan6(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<6)-1); + __m128i mask = _mm_set1_epi32((1U << 6) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = 
_mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan7(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan7(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<7)-1); + __m128i mask = _mm_set1_epi32((1U << 7) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = 
_mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 
27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan8(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan8(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<8)-1); + __m128i mask = _mm_set1_epi32((1U 
<< 8) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan9(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan9(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<9)-1); + __m128i mask = _mm_set1_epi32((1U << 9) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - 
tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan10(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan10(__m128i *initOffset, const __m128i *in) { __m128i InReg = 
_mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<10)-1); + __m128i mask = _mm_set1_epi32((1U << 10) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 
24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan11(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan11(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<11)-1); + __m128i mask = _mm_set1_epi32((1U << 11) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp 
= _mm_srli_epi32(InReg, 21); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan12(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan12(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<12)-1); + __m128i mask = _mm_set1_epi32((1U << 12) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = 
_mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - 
tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan13(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan13(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<13)-1); + __m128i mask = _mm_set1_epi32((1U << 13) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out 
= _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = 
_mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 
12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan14(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan14(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<14)-1); + __m128i mask = _mm_set1_epi32((1U << 14) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = 
_mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp 
= _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan15(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan15(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<15)-1); + __m128i mask = _mm_set1_epi32((1U << 15) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan16(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan16(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i mask = _mm_set1_epi32((1U << 16) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = 
_mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void 
-iunpackscan17(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan17(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<17)-1); + __m128i mask = _mm_set1_epi32((1U << 17) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = 
_mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan18(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan18(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<18)-1); + __m128i mask = _mm_set1_epi32((1U << 18) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, 
out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan19(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan19(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<19)-1); + __m128i mask = _mm_set1_epi32((1U << 19) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset 
= out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = 
_mm_srli_epi32(InReg, 11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + ++in; + InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - } -static void -iunpackscan20(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan20(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<20)-1); + __m128i mask = _mm_set1_epi32((1U << 20) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = 
_mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan21(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan21(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<21)-1); + __m128i mask = _mm_set1_epi32((1U << 21) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 
3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan22(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan22(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<22)-1); + __m128i mask = _mm_set1_epi32((1U << 22) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + ++in; + 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = 
_mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan23(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan23(__m128i 
*initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<23)-1); + __m128i mask = _mm_set1_epi32((1U << 23) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; 
- ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 
- 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan24(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan24(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = 
_mm_set1_epi32((1U<<24)-1); + __m128i mask = _mm_set1_epi32((1U << 24) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = 
_mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan25(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan25(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<25)-1); + __m128i mask = _mm_set1_epi32((1U << 25) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan26(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan26(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<26)-1); + __m128i mask = _mm_set1_epi32((1U << 26) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + 
InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan27(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan27(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<27)-1); + __m128i mask = _mm_set1_epi32((1U << 27) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan28(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan28(__m128i *initOffset, const __m128i *in) { __m128i InReg = 
_mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<28)-1); + __m128i mask = _mm_set1_epi32((1U << 28) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 
4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan29(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan29(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<29)-1); + __m128i mask = _mm_set1_epi32((1U << 29) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = 
_mm_srli_epi32(InReg, 2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = 
_mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = 
_mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + 
++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan30(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan30(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i mask = _mm_set1_epi32((1U << 30) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), 
mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out 
= _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; - ++in; InReg = _mm_loadu_si128(in); + ++in; + InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; - tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - - - } -static void -iunpackscan31(__m128i * initOffset, const __m128i *in) -{ +static void iunpackscan31(__m128i *initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<31)-1); + __m128i mask = _mm_set1_epi32((1U << 31) - 1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,31); + tmp = _mm_srli_epi32(InReg, 31); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,30); + tmp = _mm_srli_epi32(InReg, 30); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,29); + tmp = _mm_srli_epi32(InReg, 29); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,28); + tmp = _mm_srli_epi32(InReg, 28); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,27); + tmp = _mm_srli_epi32(InReg, 27); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,26); + tmp = _mm_srli_epi32(InReg, 26); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,25); + tmp = _mm_srli_epi32(InReg, 25); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,24); + tmp = _mm_srli_epi32(InReg, 24); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = 
_mm_srli_epi32(InReg,23); + tmp = _mm_srli_epi32(InReg, 23); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,22); + tmp = _mm_srli_epi32(InReg, 22); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,21); + tmp = _mm_srli_epi32(InReg, 21); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,20); + tmp = _mm_srli_epi32(InReg, 20); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,19); + tmp = _mm_srli_epi32(InReg, 19); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,18); + tmp = _mm_srli_epi32(InReg, 18); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,17); + tmp = _mm_srli_epi32(InReg, 17); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,16); + tmp = _mm_srli_epi32(InReg, 16); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,15); + tmp = _mm_srli_epi32(InReg, 15); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,14); + tmp = _mm_srli_epi32(InReg, 14); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,13); + tmp = _mm_srli_epi32(InReg, 13); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
31 - 12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,12); + tmp = _mm_srli_epi32(InReg, 12); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,11); + tmp = _mm_srli_epi32(InReg, 11); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,10); + tmp = _mm_srli_epi32(InReg, 10); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,9); + tmp = _mm_srli_epi32(InReg, 9); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,8); + tmp = _mm_srli_epi32(InReg, 8); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,7); + tmp = _mm_srli_epi32(InReg, 7); out = tmp; - ++in; InReg 
= _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,6); + tmp = _mm_srli_epi32(InReg, 6); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,5); + tmp = _mm_srli_epi32(InReg, 5); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,4); + tmp = _mm_srli_epi32(InReg, 4); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,3); + tmp = _mm_srli_epi32(InReg, 3); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,2); + tmp = _mm_srli_epi32(InReg, 2); out = tmp; - ++in; InReg = _mm_loadu_si128(in); - out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + ++in; + InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; - - tmp = _mm_srli_epi32(InReg,1); + tmp = _mm_srli_epi32(InReg, 1); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; - - } -static void -iunpackscan32(__m128i * initOffset , const __m128i *in) -{ - *initOffset = MM_LOAD_SI_128(in + 31); +static void iunpackscan32(__m128i *initOffset, const __m128i *in) { + *initOffset = MM_LOAD_SI_128(in + 31); } +void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit) { + switch (bit) { + case 0: + return; + break; -void -simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit) -{ - switch (bit) { - case 0: return; break; - - case 1: iunpackscan1(initOffset, in); break; + case 1: + iunpackscan1(initOffset, in); + break; - case 2: iunpackscan2(initOffset, in); break; + case 2: + iunpackscan2(initOffset, in); + break; - case 3: iunpackscan3(initOffset, in); break; + case 3: + iunpackscan3(initOffset, in); + break; - case 4: iunpackscan4(initOffset, in); break; + case 4: + iunpackscan4(initOffset, in); + break; - case 5: iunpackscan5(initOffset, in); break; + case 5: + iunpackscan5(initOffset, in); + break; - case 6: iunpackscan6(initOffset, in); break; + case 6: + iunpackscan6(initOffset, in); + break; - case 7: iunpackscan7(initOffset, in); break; + case 7: + iunpackscan7(initOffset, in); + break; - case 8: iunpackscan8(initOffset, in); break; + case 8: + iunpackscan8(initOffset, in); + break; - case 9: iunpackscan9(initOffset, in); break; + case 9: + iunpackscan9(initOffset, in); + break; - case 10: iunpackscan10(initOffset, in); break; + case 10: + iunpackscan10(initOffset, in); + break; - case 11: iunpackscan11(initOffset, in); break; + case 11: + iunpackscan11(initOffset, in); + break; - case 12: iunpackscan12(initOffset, in); break; + case 12: + iunpackscan12(initOffset, in); + break; - case 13: iunpackscan13(initOffset, in); break; + case 13: + iunpackscan13(initOffset, in); + break; 
- case 14: iunpackscan14(initOffset, in); break; + case 14: + iunpackscan14(initOffset, in); + break; - case 15: iunpackscan15(initOffset, in); break; + case 15: + iunpackscan15(initOffset, in); + break; - case 16: iunpackscan16(initOffset, in); break; + case 16: + iunpackscan16(initOffset, in); + break; - case 17: iunpackscan17(initOffset, in); break; + case 17: + iunpackscan17(initOffset, in); + break; - case 18: iunpackscan18(initOffset, in); break; + case 18: + iunpackscan18(initOffset, in); + break; - case 19: iunpackscan19(initOffset, in); break; + case 19: + iunpackscan19(initOffset, in); + break; - case 20: iunpackscan20(initOffset, in); break; + case 20: + iunpackscan20(initOffset, in); + break; - case 21: iunpackscan21(initOffset, in); break; + case 21: + iunpackscan21(initOffset, in); + break; - case 22: iunpackscan22(initOffset, in); break; + case 22: + iunpackscan22(initOffset, in); + break; - case 23: iunpackscan23(initOffset, in); break; + case 23: + iunpackscan23(initOffset, in); + break; - case 24: iunpackscan24(initOffset, in); break; + case 24: + iunpackscan24(initOffset, in); + break; - case 25: iunpackscan25(initOffset, in); break; + case 25: + iunpackscan25(initOffset, in); + break; - case 26: iunpackscan26(initOffset, in); break; + case 26: + iunpackscan26(initOffset, in); + break; - case 27: iunpackscan27(initOffset, in); break; + case 27: + iunpackscan27(initOffset, in); + break; - case 28: iunpackscan28(initOffset, in); break; + case 28: + iunpackscan28(initOffset, in); + break; - case 29: iunpackscan29(initOffset, in); break; + case 29: + iunpackscan29(initOffset, in); + break; - case 30: iunpackscan30(initOffset, in); break; + case 30: + iunpackscan30(initOffset, in); + break; - case 31: iunpackscan31(initOffset, in); break; + case 31: + iunpackscan31(initOffset, in); + break; - case 32: iunpackscan32(initOffset, in); break; + case 32: + iunpackscan32(initOffset, in); + break; - default: break; - } + default: + break; + } - return ; + 
return; } diff --git a/src/streamvbyte.c b/src/streamvbyte.c index 044eb11..e8e08c3 100644 --- a/src/streamvbyte.c +++ b/src/streamvbyte.c @@ -1,7 +1,7 @@ // StreamVByte data format: // [count] [count * 2-bits per key] [count * vbyte ints] // (out) | (key)-> | (data)-> | -// 4 bytes | (count+3)/4 bytes | max of count * 4B | +// 4 bytes | (count+3)/4 bytes | max of count * 4B | // each 8-bit key has four 2-bit lengths: 00=1B, 01=2B, 10=3B, 11=4B // no particular alignment is assumed or guaranteed for any elements @@ -15,7 +15,6 @@ #include #include "platform.h" - typedef __m128i xmm_t; #ifdef __AVX2__ @@ -24,7 +23,7 @@ typedef __m256i ymm_t; #define XMM(y) _mm256_castsi256_si128(y) #endif // __AVX2__ -#ifdef IACA +#ifdef IACA #include #else // not IACA #undef IACA_START @@ -34,916 +33,906 @@ typedef __m256i ymm_t; #endif // not IACA static inline uint8_t _encoded_length(uint32_t val) { - if (val < (1 << 8)) { - return 1; - } - if (val < (1 << 16)) { - return 2; - } - if (val < (1 << 24)) { - return 3; - } - return 4; + if (val < (1 << 8)) { + return 1; + } + if (val < (1 << 16)) { + return 2; + } + if (val < (1 << 24)) { + return 3; + } + return 4; } -static inline uint8_t _encode_data(uint32_t val, uint8_t * __restrict__ *dataPtrPtr) { - uint8_t *dataPtr = *dataPtrPtr; - uint8_t code; - - if (val < (1 << 8)) { // 1 byte - *dataPtr = (uint8_t)(val); - *dataPtrPtr += 1; - code = 0; - } else if (val < (1 << 16)) { // 2 bytes - *(uint16_t *)dataPtr = (uint16_t)(val); - *dataPtrPtr += 2; - code = 1; - } else if (val < (1 << 24)) { // 3 bytes - *(uint16_t *)dataPtr = (uint16_t)(val); - *(dataPtr + 2) = (uint8_t)(val >> 16); - *dataPtrPtr += 3; - code = 2; - } else { // 4 bytes - *(uint32_t *)dataPtr = val; - *dataPtrPtr += 4; - code = 3; - } - - return code; +static inline uint8_t _encode_data(uint32_t val, + uint8_t *__restrict__ *dataPtrPtr) { + uint8_t *dataPtr = *dataPtrPtr; + uint8_t code; + + if (val < (1 << 8)) { // 1 byte + *dataPtr = (uint8_t)(val); + 
*dataPtrPtr += 1; + code = 0; + } else if (val < (1 << 16)) { // 2 bytes + *(uint16_t *)dataPtr = (uint16_t)(val); + *dataPtrPtr += 2; + code = 1; + } else if (val < (1 << 24)) { // 3 bytes + *(uint16_t *)dataPtr = (uint16_t)(val); + *(dataPtr + 2) = (uint8_t)(val >> 16); + *dataPtrPtr += 3; + code = 2; + } else { // 4 bytes + *(uint32_t *)dataPtr = val; + *dataPtrPtr += 4; + code = 3; + } + + return code; } -uint8_t *svb_encode_scalar_d1_init(const uint32_t *in, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint32_t count, - uint32_t prev) { - if (count == 0) return dataPtr; // exit immediately if no data - - uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... - uint8_t key = 0; - for (uint32_t c = 0; c < count; c++) { - if (shift == 8) { - shift = 0; - *keyPtr++ = key; - key = 0; - } - uint32_t val = in[c] - prev; - prev = in[c]; - uint8_t code = _encode_data(val, &dataPtr); - key |= code << shift; - shift += 2; +uint8_t *svb_encode_scalar_d1_init(const uint32_t *in, + uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, + uint32_t count, uint32_t prev) { + if (count == 0) + return dataPtr; // exit immediately if no data + + uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... 
+ uint8_t key = 0; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + *keyPtr++ = key; + key = 0; } + uint32_t val = in[c] - prev; + prev = in[c]; + uint8_t code = _encode_data(val, &dataPtr); + key |= code << shift; + shift += 2; + } - *keyPtr = key; // write last key (no increment needed) - return dataPtr; // pointer to first unused data byte + *keyPtr = key; // write last key (no increment needed) + return dataPtr; // pointer to first unused data byte } -uint8_t *svb_encode_scalar_d1(const uint32_t *in, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint32_t count) { - return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, 0); +uint8_t *svb_encode_scalar_d1(const uint32_t *in, uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, uint32_t count) { + return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, 0); } -uint8_t *svb_encode_scalar(const uint32_t *in, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint32_t count) { - if (count == 0) return dataPtr; // exit immediately if no data - - uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... - uint8_t key = 0; - for (uint32_t c = 0; c < count; c++) { - if (shift == 8) { - shift = 0; - *keyPtr++ = key; - key = 0; - } - uint32_t val = in[c]; - uint8_t code = _encode_data(val, &dataPtr); - key |= code << shift; - shift += 2; +uint8_t *svb_encode_scalar(const uint32_t *in, uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, uint32_t count) { + if (count == 0) + return dataPtr; // exit immediately if no data + + uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ... 
+ uint8_t key = 0; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + *keyPtr++ = key; + key = 0; } - - *keyPtr = key; // write last key (no increment needed) - return dataPtr; // pointer to first unused data byte + uint32_t val = in[c]; + uint8_t code = _encode_data(val, &dataPtr); + key |= code << shift; + shift += 2; + } + + *keyPtr = key; // write last key (no increment needed) + return dataPtr; // pointer to first unused data byte } static inline uint32_t _decode_data(uint8_t **dataPtrPtr, uint8_t code) { - uint8_t *dataPtr = *dataPtrPtr; - uint32_t val; - - if (code == 0) { // 1 byte - val = (uint32_t) *dataPtr; - dataPtr += 1; - } else if (code == 1) { // 2 bytes - val = (uint32_t) *(uint16_t *)dataPtr; - dataPtr += 2; - } else if (code == 2) { // 3 bytes - val = (uint32_t) *(uint16_t *)dataPtr; - val |= *(dataPtr + 2) << 16; - dataPtr += 3; - } else { // code == 3 - val = *(uint32_t *) dataPtr; // 4 bytes - dataPtr += 4; - } - - *dataPtrPtr = dataPtr; - return val; + uint8_t *dataPtr = *dataPtrPtr; + uint32_t val; + + if (code == 0) { // 1 byte + val = (uint32_t)*dataPtr; + dataPtr += 1; + } else if (code == 1) { // 2 bytes + val = (uint32_t) * (uint16_t *)dataPtr; + dataPtr += 2; + } else if (code == 2) { // 3 bytes + val = (uint32_t) * (uint16_t *)dataPtr; + val |= *(dataPtr + 2) << 16; + dataPtr += 3; + } else { // code == 3 + val = *(uint32_t *)dataPtr; // 4 bytes + dataPtr += 4; + } + + *dataPtrPtr = dataPtr; + return val; } uint8_t *svb_append_scalar_d1(uint8_t *keyPtr, uint8_t *dataPtr, - size_t sizebytes, size_t count, uint32_t delta) -{ - uint32_t keyLen = (uint32_t)(count + 3) / 4; // 2-bits rounded to full byte - - // make space for additional keys? 
- if (count >= keyLen * 4) { - memmove(dataPtr + 1, dataPtr, sizebytes - keyLen); - *dataPtr = 0; - dataPtr++; - sizebytes++; - keyLen++; - } - - keyPtr += count / 4; - dataPtr += sizebytes - keyLen; - int shift = (count % 4) * 2; - - uint8_t code = _encode_data(delta, &dataPtr); - *keyPtr |= code << shift; - return dataPtr; + size_t sizebytes, size_t count, uint32_t delta) { + uint32_t keyLen = (uint32_t)(count + 3) / 4; // 2-bits rounded to full byte + + // make space for additional keys? + if (count >= keyLen * 4) { + memmove(dataPtr + 1, dataPtr, sizebytes - keyLen); + *dataPtr = 0; + dataPtr++; + sizebytes++; + keyLen++; + } + + keyPtr += count / 4; + dataPtr += sizebytes - keyLen; + int shift = (count % 4) * 2; + + uint8_t code = _encode_data(delta, &dataPtr); + *keyPtr |= code << shift; + return dataPtr; } uint8_t *svb_insert_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, - size_t dataSize, uint32_t count, - uint32_t prev, uint32_t new_key, - uint32_t *position) { - if (count == 0) { - *position = 0; - return svb_encode_scalar_d1_init(&new_key, keyPtr, dataPtr, 1, prev); - } + size_t dataSize, uint32_t count, + uint32_t prev, uint32_t new_key, + uint32_t *position) { + if (count == 0) { + *position = 0; + return svb_encode_scalar_d1_init(&new_key, keyPtr, dataPtr, 1, prev); + } - uint8_t shift = 0; - uint32_t key = *keyPtr; - uint8_t *dataPtrBegin = dataPtr; + uint8_t shift = 0; + uint32_t key = *keyPtr; + uint8_t *dataPtrBegin = dataPtr; - for (uint32_t c = 0; c < count; c++) { - uint8_t *dataPtrPrev = dataPtr; + for (uint32_t c = 0; c < count; c++) { + uint8_t *dataPtrPrev = dataPtr; - if (shift == 8) { - shift = 0; - key = *(++keyPtr); - } - - uint8_t current_key_code = (key >> shift) & 0x3; - uint32_t current_key = prev + _decode_data(&dataPtr, current_key_code); - if (current_key >= new_key) { - // rewind to the insertion position - dataPtr = dataPtrPrev; - - // shift keys 2 bits "to the right" - uint32_t mask_hi = key & (~0 << shift); - uint32_t 
mask_lo = key & ((1 << shift) - 1); - key = (mask_hi << 2) | mask_lo; - uint32_t carry_bits, prev_carry_bits = (key & (3 << 8)) >> 8; - - for (uint8_t *p = keyPtr + 1; p < dataPtrBegin; p++) { - carry_bits = (*p & (3 << 6)) >> 6; - *p <<= 2; - *p |= prev_carry_bits; - prev_carry_bits = carry_bits; - } - - // shift values n bytes "to the right", then insert the new key - // and the updated delta of the next key - int gap = _encoded_length(new_key - prev) - + _encoded_length(current_key - new_key) - - (current_key_code + 1); - assert(gap >= 0); - if (gap > 0) - memmove(dataPtr + gap, dataPtr, - dataSize - (dataPtr - dataPtrBegin)); - - // first insert the new key - uint8_t code = _encode_data(new_key - prev, &dataPtr); - *keyPtr = key | (code << shift); - - // then update the current key - shift += 2; - if (shift == 8) { - shift = 0; - keyPtr++; - } - code = _encode_data(current_key - new_key, &dataPtr); - *keyPtr &= ~(3 << shift); - *keyPtr |= (code << shift); - - *position = c; - return dataPtrBegin + dataSize + gap; - } - - prev = current_key; - shift += 2; + if (shift == 8) { + shift = 0; + key = *(++keyPtr); } - // append the new key at the end - if (shift == 8) { + uint8_t current_key_code = (key >> shift) & 0x3; + uint32_t current_key = prev + _decode_data(&dataPtr, current_key_code); + if (current_key >= new_key) { + // rewind to the insertion position + dataPtr = dataPtrPrev; + + // shift keys 2 bits "to the right" + uint32_t mask_hi = key & (~0 << shift); + uint32_t mask_lo = key & ((1 << shift) - 1); + key = (mask_hi << 2) | mask_lo; + uint32_t carry_bits, prev_carry_bits = (key & (3 << 8)) >> 8; + + for (uint8_t *p = keyPtr + 1; p < dataPtrBegin; p++) { + carry_bits = (*p & (3 << 6)) >> 6; + *p <<= 2; + *p |= prev_carry_bits; + prev_carry_bits = carry_bits; + } + + // shift values n bytes "to the right", then insert the new key + // and the updated delta of the next key + int gap = _encoded_length(new_key - prev) + + _encoded_length(current_key - new_key) 
- (current_key_code + 1); + assert(gap >= 0); + if (gap > 0) + memmove(dataPtr + gap, dataPtr, dataSize - (dataPtr - dataPtrBegin)); + + // first insert the new key + uint8_t code = _encode_data(new_key - prev, &dataPtr); + *keyPtr = key | (code << shift); + + // then update the current key + shift += 2; + if (shift == 8) { shift = 0; keyPtr++; + } + code = _encode_data(current_key - new_key, &dataPtr); + *keyPtr &= ~(3 << shift); + *keyPtr |= (code << shift); + + *position = c; + return dataPtrBegin + dataSize + gap; } - uint8_t code = _encode_data(new_key - prev, &dataPtr); - key &= ~(3 << shift); - key |= code << shift; - *keyPtr = key; // write last key (no increment needed) + prev = current_key; + shift += 2; + } - *position = count; - return dataPtrBegin + dataSize + code + 1; -} + // append the new key at the end + if (shift == 8) { + shift = 0; + keyPtr++; + } + uint8_t code = _encode_data(new_key - prev, &dataPtr); + key &= ~(3 << shift); + key |= code << shift; + *keyPtr = key; // write last key (no increment needed) + + *position = count; + return dataPtrBegin + dataSize + code + 1; +} uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr, - uint8_t *dataPtr, uint32_t count, uint32_t prev) { - if (count == 0) return dataPtr; // no reads or writes if no data + uint8_t *dataPtr, uint32_t count, + uint32_t prev) { + if (count == 0) + return dataPtr; // no reads or writes if no data - uint8_t shift = 0; - uint32_t key = *keyPtr++; + uint8_t shift = 0; + uint32_t key = *keyPtr++; - for (uint32_t c = 0; c < count; c++) { - if (shift == 8) { - shift = 0; - key = *keyPtr++; - } - uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3); - val += prev; - *outPtr++ = val; - prev = val; - shift += 2; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; } - - return dataPtr; // pointer to first unused byte after end + uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3); + val += prev; + 
*outPtr++ = val; + prev = val; + shift += 2; + } + + return dataPtr; // pointer to first unused byte after end } - - uint8_t *svb_decode_scalar_d1(uint32_t *outPtr, const uint8_t *keyPtr, uint8_t *dataPtr, uint32_t count) { - return svb_decode_scalar_d1_init(outPtr, keyPtr, dataPtr, count, 0); + return svb_decode_scalar_d1_init(outPtr, keyPtr, dataPtr, count, 0); } -uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr, +uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr, uint8_t *dataPtr, uint32_t count) { - if (count == 0) return dataPtr; // no reads or writes if no data - - uint8_t shift = 0; - uint32_t key = *keyPtr++; - for (uint32_t c = 0; c < count; c++) { - if (shift == 8) { - shift = 0; - key = *keyPtr++; - } - uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3); - *outPtr++ = val; - shift += 2; + if (count == 0) + return dataPtr; // no reads or writes if no data + + uint8_t shift = 0; + uint32_t key = *keyPtr++; + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; } + uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3); + *outPtr++ = val; + shift += 2; + } - return dataPtr; // pointer to first unused byte after end + return dataPtr; // pointer to first unused byte after end } static uint8_t lengthTable[256] = { - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 
13, 14, 12, 13, 14, 15, - 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, - 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, - 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16 -}; + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6, 7, + 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, + 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, + 10, 11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, + 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, + 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, + 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, + 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, + 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, + 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, + 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, + 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, + 14, 12, 13, 14, 15, 13, 14, 15, 16}; static uint8_t shuffleTable[256][16] = { - { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1 }, // 1111 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 4111 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3211 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 4211 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1311 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2311 - { 0, 1, 
2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3311 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 4311 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1411 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2411 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3411 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 4411 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3121 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 4121 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3221 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 4221 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1321 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2321 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3321 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 4321 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1421 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2421 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3421 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 4421 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 1131 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 2131 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 3131 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 4131 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1231 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2231 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 
-1, -1, -1 }, // 3231 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 4231 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1331 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2331 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3331 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 4331 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1431 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2431 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3431 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 4431 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1141 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2141 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3141 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 4141 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1241 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2241 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3241 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 4241 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 
6, 7, -1, -1 }, // 4112 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422 - { 0, -1, -1, -1, 1, -1, -1, 
-1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, -1, -1 }, // 2442 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, 
-1, 8, 9, 10, -1 }, // 3323 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 
4243 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224 - { 0, 
1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134 - { 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434 - { 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144 - { 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144 - { 0, 1, 2, -1, 3, -1, -1, -1, 
4, 5, 6, 7, 8, 9, 10, 11 }, // 3144 - { 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144 - { 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244 - { 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244 - { 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244 - { 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244 - { 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344 - { 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344 - { 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344 - { 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344 - { 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444 - { 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444 - { 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444 - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444 + {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1}, // 1111 + {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1}, // 2111 + {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1}, // 3111 + {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1}, // 4111 + {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1}, // 1211 + {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1}, // 2211 + {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1}, // 3211 + {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1}, // 4211 + {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1}, // 1311 + {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1}, // 2311 + {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1}, // 3311 + {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1}, // 4311 + {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1}, // 1411 + {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1}, // 2411 + {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1}, // 3411 + {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 
9, -1, -1, -1}, // 4411 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1}, // 1121 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1}, // 2121 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1}, // 3121 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1}, // 4121 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1}, // 1221 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1}, // 2221 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1}, // 3221 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1}, // 4221 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1}, // 1321 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1}, // 2321 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1}, // 3321 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1}, // 4321 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1}, // 1421 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1}, // 2421 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1}, // 3421 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1}, // 4421 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1}, // 1131 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1}, // 2131 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1}, // 3131 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1}, // 4131 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1}, // 1231 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1}, // 2231 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1}, // 3231 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1}, // 4231 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1}, // 1331 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1}, // 2331 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1}, // 3331 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1}, // 4331 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1}, // 1431 + {0, 1, 
-1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1}, // 2431 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1}, // 3431 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1}, // 4431 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1}, // 1141 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1}, // 2141 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1}, // 3141 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1}, // 4141 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1}, // 1241 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1}, // 2241 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1}, // 3241 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1}, // 4241 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1}, // 1341 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1}, // 2341 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1}, // 3341 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1}, // 4341 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1}, // 1441 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1}, // 2441 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1}, // 3441 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1}, // 4441 + {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1}, // 1112 + {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1}, // 2112 + {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1}, // 3112 + {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1}, // 4112 + {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1}, // 1212 + {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1}, // 2212 + {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1}, // 3212 + {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1}, // 4212 + {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1}, // 1312 + {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1}, // 2312 + {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, 
-1}, // 3312 + {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1}, // 4312 + {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1}, // 1412 + {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1}, // 2412 + {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1}, // 3412 + {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1}, // 4412 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1}, // 1122 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1}, // 2122 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1}, // 3122 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1}, // 4122 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1}, // 1222 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1}, // 2222 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1}, // 3222 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1}, // 4222 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1}, // 1322 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1}, // 2322 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1}, // 3322 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1}, // 4322 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1}, // 1422 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1}, // 2422 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1}, // 3422 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1}, // 4422 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1}, // 1132 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1}, // 2132 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1}, // 3132 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1}, // 4132 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1}, // 1232 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1}, // 2232 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1}, // 3232 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1}, // 4232 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 
-1, 7, 8, -1, -1}, // 1332 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1}, // 2332 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1}, // 3332 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1}, // 4332 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1}, // 1432 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1}, // 2432 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1}, // 3432 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1}, // 4432 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1}, // 1142 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1}, // 2142 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1}, // 3142 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1}, // 4142 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1}, // 1242 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1}, // 2242 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1}, // 3242 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1}, // 4242 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1}, // 1342 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1}, // 2342 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1}, // 3342 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1}, // 4342 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1}, // 1442 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1}, // 2442 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1}, // 3442 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1}, // 4442 + {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1}, // 1113 + {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1}, // 2113 + {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1}, // 3113 + {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1}, // 4113 + {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1}, // 1213 + {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1}, // 2213 + {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, 
-1, -1, 6, 7, 8, -1}, // 3213 + {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1}, // 4213 + {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1}, // 1313 + {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1}, // 2313 + {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1}, // 3313 + {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1}, // 4313 + {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1}, // 1413 + {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1}, // 2413 + {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1}, // 3413 + {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1}, // 4413 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1}, // 1123 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1}, // 2123 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1}, // 3123 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1}, // 4123 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1}, // 1223 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1}, // 2223 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1}, // 3223 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1}, // 4223 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1}, // 1323 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1}, // 2323 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1}, // 3323 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1}, // 4323 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1}, // 1423 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1}, // 2423 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1}, // 3423 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1}, // 4423 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1}, // 1133 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1}, // 2133 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1}, // 3133 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1}, // 4133 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 
5, -1, 6, 7, 8, -1}, // 1233 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1}, // 2233 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1}, // 3233 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1}, // 4233 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1}, // 1333 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1}, // 2333 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1}, // 3333 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1}, // 4333 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1}, // 1433 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1}, // 2433 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1}, // 3433 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1}, // 4433 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1}, // 1143 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1}, // 2143 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1}, // 3143 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1}, // 4143 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1}, // 1243 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1}, // 2243 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1}, // 3243 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1}, // 4243 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1}, // 1343 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1}, // 2343 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1}, // 3343 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1}, // 4343 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1}, // 1443 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1}, // 2443 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1}, // 3443 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}, // 4443 + {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6}, // 1114 + {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7}, // 2114 + {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 
6, 7, 8}, // 3114 + {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9}, // 4114 + {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7}, // 1214 + {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8}, // 2214 + {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9}, // 3214 + {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10}, // 4214 + {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8}, // 1314 + {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9}, // 2314 + {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10}, // 3314 + {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11}, // 4314 + {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9}, // 1414 + {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10}, // 2414 + {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11}, // 3414 + {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12}, // 4414 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7}, // 1124 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8}, // 2124 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9}, // 3124 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10}, // 4124 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8}, // 1224 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9}, // 2224 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10}, // 3224 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11}, // 4224 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9}, // 1324 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10}, // 2324 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11}, // 3324 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12}, // 4324 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10}, // 1424 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11}, // 2424 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12}, // 3424 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13}, // 4424 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8}, 
// 1134 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9}, // 2134 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10}, // 3134 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11}, // 4134 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9}, // 1234 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10}, // 2234 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11}, // 3234 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12}, // 4234 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10}, // 1334 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11}, // 2334 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12}, // 3334 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13}, // 4334 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11}, // 1434 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12}, // 2434 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13}, // 3434 + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14}, // 4434 + {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9}, // 1144 + {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10}, // 2144 + {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11}, // 3144 + {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12}, // 4144 + {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10}, // 1244 + {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11}, // 2244 + {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12}, // 3244 + {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13}, // 4244 + {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11}, // 1344 + {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12}, // 2344 + {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13}, // 3344 + {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14}, // 4344 + {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, // 1444 + {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, // 2444 + {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, // 3444 + {0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} // 4444 }; -// static char HighTo32[16] = {8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1}; +// static char HighTo32[16] = {8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, +// 15, -1, -1}; // Byte Order: {0x0706050403020100, 0x0F0E0D0C0B0A0908} #ifndef _MSC_VER static xmm_t High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C}; #else -static xmm_t High16To32 = {8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1}; +static xmm_t High16To32 = {8, 9, -1, -1, 10, 11, -1, -1, + 12, 13, -1, -1, 14, 15, -1, -1}; #endif - - static inline void _write_avx(uint32_t *out, xmm_t Vec) { - /////////////////// - // was: - // - //_mm_stream_si128((xmm_t *)out, Vec); - // - // Daniel: It is not "fair" to use _mm_stream_si128: _mm_stream_si128 has alignment - // requirements unlike _mm_storeu_si128. In any case, other codecs do not use _mm_stream_si128 - // so any gain specific to _mm_stream_si128 would make comparisons difficult. If the gains - // are really substantial, then that is are result that should be studied separately. - /////////////////// - _mm_storeu_si128((xmm_t *)out, Vec); + /////////////////// + // was: + // + //_mm_stream_si128((xmm_t *)out, Vec); + // + // Daniel: It is not "fair" to use _mm_stream_si128: _mm_stream_si128 has + // alignment + // requirements unlike _mm_storeu_si128. In any case, other codecs do not use + // _mm_stream_si128 + // so any gain specific to _mm_stream_si128 would make comparisons difficult. + // If the gains + // are really substantial, then that is are result that should be studied + // separately. 
+ /////////////////// + _mm_storeu_si128((xmm_t *)out, Vec); } -#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element +#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element static inline xmm_t _write_16bit_avx_d1(uint32_t *out, xmm_t Vec, xmm_t Prev) { - // vec == [A B C D E F G H] (16 bit values) - xmm_t Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] - Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) - Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] - Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] - Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] - xmm_t V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) - V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit) - xmm_t V2 = _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit) - V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) - _write_avx(out,V1); - _write_avx(out+4,V2); - //_mm_stream_si128((xmm_t *)(out), V1); - //_mm_stream_si128((xmm_t *)(out + 4), V2); - return V2; + // vec == [A B C D E F G H] (16 bit values) + xmm_t Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) + Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] + Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] + Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] + xmm_t V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) + V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit) + xmm_t V2 = + _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit) + V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) + _write_avx(out, V1); + _write_avx(out + 4, V2); + //_mm_stream_si128((xmm_t *)(out), V1); + //_mm_stream_si128((xmm_t *)(out + 4), V2); + return V2; } - static inline xmm_t _write_avx_d1(uint32_t *out, xmm_t Vec, xmm_t Prev) { - xmm_t 
Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) - Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] - Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] - Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] - Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] - Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] - - _write_avx(out,Vec); - //_mm_stream_si128((xmm_t *)out, Vec); - return Vec; + xmm_t Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] + Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] + Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] + Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] + Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] + + _write_avx(out, Vec); + //_mm_stream_si128((xmm_t *)out, Vec); + return Vec; } // static xmm_t ExtraDelta[256] = { {0x0000000000000000, 0x0000000000000000} }; -static inline xmm_t _decode_avx(uint32_t key, - uint8_t * __restrict__ *dataPtrPtr) { - uint8_t len = lengthTable[key]; - xmm_t Data = _mm_loadu_si128((xmm_t *)*dataPtrPtr); - xmm_t Shuf = *(xmm_t *)&shuffleTable[key]; - - Data = _mm_shuffle_epi8(Data, Shuf); - *dataPtrPtr += len; - - // Appears feasible to have custom delta per key without slowdown - // Data = _mm_add_epi32(Data, ExtraDelta[key]); - - return Data; -} +static inline xmm_t _decode_avx(uint32_t key, + uint8_t *__restrict__ *dataPtrPtr) { + uint8_t len = lengthTable[key]; + xmm_t Data = _mm_loadu_si128((xmm_t *)*dataPtrPtr); + xmm_t Shuf = *(xmm_t *)&shuffleTable[key]; + Data = _mm_shuffle_epi8(Data, Shuf); + *dataPtrPtr += len; + // Appears feasible to have custom delta per key without slowdown + // Data = _mm_add_epi32(Data, ExtraDelta[key]); -uint8_t *svb_decode_avx_d1_init(uint32_t *out, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint64_t count, - uint32_t prev) { - uint64_t 
keybytes = count / 4; // number of key bytes - if (keybytes >= 8) { - xmm_t Prev = _mm_set1_epi32(prev); - xmm_t Data; - - int64_t Offset = -(int64_t) keybytes / 8 + 1; - - const uint64_t * keyPtr64 = (const uint64_t *) keyPtr - Offset; - uint64_t nextkeys = keyPtr64[Offset]; - for (; Offset != 0; ++Offset) { - uint64_t keys = nextkeys; - nextkeys = keyPtr64[Offset + 1]; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - Prev = _write_16bit_avx_d1(out, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - Prev = _write_16bit_avx_d1(out + 8, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - Prev = _write_16bit_avx_d1(out + 16, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 24))); - Prev = _write_16bit_avx_d1(out + 24, Data, Prev); - out += 32; - dataPtr += 32; - continue; - } - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - Prev = _write_avx_d1(out, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 4, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 8, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 12, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 16, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 20, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 24, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 28, Data, Prev); - - out += 32; - } - { - uint64_t keys = nextkeys; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - Data = 
_mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - Prev = _write_16bit_avx_d1(out, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - Prev = _write_16bit_avx_d1(out + 8, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - Prev = _write_16bit_avx_d1(out + 16, Data, Prev); - Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *) (dataPtr + 24))); - Prev = _write_16bit_avx_d1(out + 24, Data, Prev); - out += 32; - dataPtr += 32; - - } else { - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - Prev = _write_avx_d1(out, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 4, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 8, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 12, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 16, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 20, Data, Prev); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - Prev = _write_avx_d1(out + 24, Data, Prev); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - Prev = _write_avx_d1(out + 28, Data, Prev); - - out += 32; - } - } - prev = out[-1]; - } - uint64_t consumedkeys = keybytes - (keybytes & 7); - return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys,dataPtr, count & 31, prev); - + return Data; } -uint8_t *svb_decode_avx_d1_simple(uint32_t *out, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint64_t count) { - return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, 0); +uint8_t *svb_decode_avx_d1_init(uint32_t *out, uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, uint64_t count, + uint32_t prev) { + uint64_t keybytes = count / 4; // number of key bytes + if (keybytes >= 8) { + xmm_t Prev = 
_mm_set1_epi32(prev); + xmm_t Data; + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys = keyPtr64[Offset]; + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + nextkeys = keyPtr64[Offset + 1]; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + Prev = _write_16bit_avx_d1(out, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + Prev = _write_16bit_avx_d1(out + 8, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + Prev = _write_16bit_avx_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 24))); + Prev = _write_16bit_avx_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + continue; + } + + Data = _decode_avx(keys & 0x00FF, &dataPtr); + Prev = _write_avx_d1(out, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 8, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 16, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 24, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 28, Data, Prev); + + out += 32; + } + { + uint64_t keys = nextkeys; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + Prev = _write_16bit_avx_d1(out, Data, Prev); + 
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + Prev = _write_16bit_avx_d1(out + 8, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + Prev = _write_16bit_avx_d1(out + 16, Data, Prev); + Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *)(dataPtr + 24))); + Prev = _write_16bit_avx_d1(out + 24, Data, Prev); + out += 32; + dataPtr += 32; + + } else { + + Data = _decode_avx(keys & 0x00FF, &dataPtr); + Prev = _write_avx_d1(out, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 4, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 8, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 12, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 16, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 20, Data, Prev); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + Prev = _write_avx_d1(out + 24, Data, Prev); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + Prev = _write_avx_d1(out + 28, Data, Prev); + + out += 32; + } + } + prev = out[-1]; + } + uint64_t consumedkeys = keybytes - (keybytes & 7); + return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, + count & 31, prev); } - - - - -uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t * __restrict__ keyPtr, - uint8_t * __restrict__ dataPtr, uint64_t count) { - - uint64_t keybytes = count / 4; // number of key bytes - xmm_t Data; - if(keybytes >= 8 ){ - - int64_t Offset = - (int64_t) keybytes / 8 + 1; - - const uint64_t * keyPtr64 = (const uint64_t *) keyPtr - Offset; - uint64_t nextkeys = keyPtr64[Offset]; - for (; Offset != 0; ++Offset) { - uint64_t keys = nextkeys; - nextkeys = keyPtr64[Offset + 1]; - - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out, Data); - Data = 
_decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 4, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 8, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 12, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 16, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 20, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 24, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 28, Data); - - out += 32; - } - { - uint64_t keys = nextkeys; - - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 4, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 8, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 12, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 16, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 20, Data); - - keys >>= 16; - Data = _decode_avx((keys & 0xFF), &dataPtr); - _write_avx(out + 24, Data); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - _write_avx(out + 28, Data); - - out += 32; - } - } - uint64_t consumedkeys = keybytes - (keybytes & 7); - return svb_decode_scalar(out, keyPtr + consumedkeys,dataPtr, count & 31); +uint8_t *svb_decode_avx_d1_simple(uint32_t *out, uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, + uint64_t count) { + return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, 0); } - -uint64_t svb_encode(uint8_t *out, uint32_t *in, uint32_t count, - int delta, int type) { - *(uint32_t *)out = count; // first 4 bytes is number of ints - uint8_t *keyPtr = out + 4; // keys come immediately after 32-bit count - uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte - 
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys - - if (delta == 0 && type == 1) { - return svb_encode_scalar(in, keyPtr, dataPtr, count) - out; +uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t *__restrict__ keyPtr, + uint8_t *__restrict__ dataPtr, uint64_t count) { + + uint64_t keybytes = count / 4; // number of key bytes + xmm_t Data; + if (keybytes >= 8) { + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys = keyPtr64[Offset]; + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + nextkeys = keyPtr64[Offset + 1]; + + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 4, Data); + + keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 8, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 12, Data); + + keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 16, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 20, Data); + + keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 24, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 28, Data); + + out += 32; } - - if (delta == 1 && type == 1) { - return svb_encode_scalar_d1(in, keyPtr, dataPtr, count) - out; + { + uint64_t keys = nextkeys; + + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 4, Data); + + keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 8, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 12, Data); + + keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 16, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 20, Data); + + 
keys >>= 16; + Data = _decode_avx((keys & 0xFF), &dataPtr); + _write_avx(out + 24, Data); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + _write_avx(out + 28, Data); + + out += 32; } + } + uint64_t consumedkeys = keybytes - (keybytes & 7); + return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31); +} +uint64_t svb_encode(uint8_t *out, uint32_t *in, uint32_t count, int delta, + int type) { + *(uint32_t *)out = count; // first 4 bytes is number of ints + uint8_t *keyPtr = out + 4; // keys come immediately after 32-bit count + uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte + uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys - printf("Unknown delta (%d) type (%d) combination.\n", delta, type); - abort(); + if (delta == 0 && type == 1) { + return svb_encode_scalar(in, keyPtr, dataPtr, count) - out; + } + + if (delta == 1 && type == 1) { + return svb_encode_scalar_d1(in, keyPtr, dataPtr, count) - out; + } + + printf("Unknown delta (%d) type (%d) combination.\n", delta, type); + abort(); } uint64_t svb_decode(uint32_t *out, uint8_t *in, int delta, int type) { - uint32_t count = *(uint32_t *)in; // first 4 bytes is number of ints - if (count == 0) return 0; - - uint8_t *keyPtr = in + 4; // full list of keys is next - uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) - uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys - - if (delta == 0 && type == 1) { - return svb_decode_scalar(out, keyPtr, dataPtr, count) - in; - } + uint32_t count = *(uint32_t *)in; // first 4 bytes is number of ints + if (count == 0) + return 0; - if (delta == 1 && type == 1) { - return svb_decode_scalar_d1(out, keyPtr, dataPtr, count) - in; - } + uint8_t *keyPtr = in + 4; // full list of keys is next + uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up) + uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys + if (delta == 0 && type == 1) { + return svb_decode_scalar(out, 
keyPtr, dataPtr, count) - in; + } + if (delta == 1 && type == 1) { + return svb_decode_scalar_d1(out, keyPtr, dataPtr, count) - in; + } - if (delta == 0 && type == 5) { - return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in; - } + if (delta == 0 && type == 5) { + return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in; + } + if (delta == 1 && type == 5) { + return svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count) - in; + } - if (delta == 1 && type == 5) { - return svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count) - in; - } - - printf("Unknown delta (%d) type (%d) combination.\n", delta, type); - abort(); + printf("Unknown delta (%d) type (%d) combination.\n", delta, type); + abort(); } /* should emulate std:lower_bound*/ -static int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) -{ - int imid; - imax --; - while(imin + 1 < imax) { - imid = imin + ((imax - imin) / 2); - - if (A[imid] >= key) { - imax = imid; - } else if (A[imid] < key) { - imin = imid; - } +static int lower_bound(uint32_t *A, uint32_t key, int imin, int imax) { + int imid; + imax--; + while (imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; } - if(A[imin] >= key) return imin; - return imax; + } + if (A[imin] >= key) + return imin; + return imax; } -static SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes[16 * 16 ] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 
14, 15, - 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - }; - -static const __m128i *streamvbyte_shuffle_mask = (__m128i *) shuffle_mask_bytes; +static SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes[16 * 16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, + 15, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, + 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, + 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +static const __m128i *streamvbyte_shuffle_mask = (__m128i *)shuffle_mask_bytes; static inline int find_lower_bound(__m128i *PrevHi, __m128i *PrevLow, - uint32_t key, uint32_t *presult) -{ - int offset = 0; - int s; + uint32_t key, uint32_t *presult) { + int offset = 0; + int s; #if 0 // scalar code: @@ -955,486 +944,492 @@ static inline int find_lower_bound(__m128i *PrevHi, __m128i *PrevLow, return (s); #endif - uint32_t mask; - __m128i key4 = _mm_set1_epi32(key - 2147483648U); - __m128i conversion = _mm_set1_epi32(2147483648U); - __m128i 
tmp = _mm_sub_epi32(*PrevLow, conversion); + uint32_t mask; + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i tmp = _mm_sub_epi32(*PrevLow, conversion); + mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmp, key4))); + __m128i *out = PrevLow; + // not found in PrevLow? then try PrevHi + if (mask == 15) { + tmp = _mm_sub_epi32(*PrevHi, conversion); mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmp, key4))); - __m128i *out = PrevLow; - // not found in PrevLow? then try PrevHi - if (mask == 15) { - tmp = _mm_sub_epi32(*PrevHi, conversion); - mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmp, key4))); - out = PrevHi; - offset = 4; - } - // there MUST be a result! - assert(mask != 15); - const __m128i p = _mm_shuffle_epi8(*out, - streamvbyte_shuffle_mask[mask ^ 15]); - s = __builtin_ctz(mask ^ 15); - *presult = _mm_cvtsi128_si32(p); - - return (offset + s); + out = PrevHi; + offset = 4; + } + // there MUST be a result! 
+ assert(mask != 15); + const __m128i p = _mm_shuffle_epi8(*out, streamvbyte_shuffle_mask[mask ^ 15]); + s = __builtin_ctz(mask ^ 15); + *presult = _mm_cvtsi128_si32(p); + + return (offset + s); } -static inline void _scan_16bit_avx_d1(xmm_t Vec, xmm_t *PrevHi, xmm_t *PrevLow) { - xmm_t Prev = *PrevHi; - // vec == [A B C D E F G H] (16 bit values) - xmm_t Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] - Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) - Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] - Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] - Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] - *PrevLow = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) - *PrevLow = _mm_add_epi32(*PrevLow, Prev); // [PA PAB PABC PABCD] (32-bit) - *PrevHi = _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit) - *PrevHi = _mm_add_epi32(*PrevHi, *PrevLow); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) +static inline void _scan_16bit_avx_d1(xmm_t Vec, xmm_t *PrevHi, + xmm_t *PrevLow) { + xmm_t Prev = *PrevHi; + // vec == [A B C D E F G H] (16 bit values) + xmm_t Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G] + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit) + Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH] + Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF] + Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH] + *PrevLow = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit) + *PrevLow = _mm_add_epi32(*PrevLow, Prev); // [PA PAB PABC PABCD] (32-bit) + *PrevHi = _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit) + *PrevHi = _mm_add_epi32( + *PrevHi, *PrevLow); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit) } static inline xmm_t _scan_avx_d1(xmm_t Vec, xmm_t Prev) { - xmm_t Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) - Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] - Vec 
= _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] - Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] - Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] - return _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] + xmm_t Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done) + Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P] + Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD] + Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB] + Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD] + return _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD] } -SIMDCOMP_ALIGNED(16) int8_t streamvbyte_shuffle_mask_bytes[256] = { - 0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0, - 4,5,6,7,0,0,0,0,0,0,0,0,0,0,0,0, - 8,9,10,11,0,0,0,0,0,0,0,0,0,0,0,0, - 12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0, - }; +SIMDCOMP_ALIGNED(16) +int8_t streamvbyte_shuffle_mask_bytes[256] = { + 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; static const __m128i *shuffle_mask = (__m128i *)streamvbyte_shuffle_mask_bytes; static inline uint32_t _extract_from_xmm(xmm_t *PrevHi, xmm_t *PrevLow, int i) { - static const int indices[] = {0, 0, 0, 0, 1, 1, 1, 1}; - xmm_t *prevs[2] = {PrevLow, PrevHi}; - return _mm_cvtsi128_si32( - _mm_shuffle_epi8(*prevs[indices[i]], shuffle_mask[i & 3])); + static const int indices[] = {0, 0, 0, 0, 1, 1, 1, 1}; + xmm_t *prevs[2] = {PrevLow, PrevHi}; + return _mm_cvtsi128_si32( + _mm_shuffle_epi8(*prevs[indices[i]], shuffle_mask[i & 3])); } -int svb_find_avx_d1_init(uint8_t *keyPtr, - uint8_t *dataPtr, uint64_t count, +int svb_find_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count, uint32_t prev, uint32_t key, uint32_t *presult) { - uint64_t keybytes = count / 4; // number of key bytes - int consumedInts = 0; - - if (keybytes >= 8) { - xmm_t PrevHi = 
_mm_set1_epi32(prev); - xmm_t PrevLow; - xmm_t Data; - - int64_t Offset = -(int64_t) keybytes / 8 + 1; - - const uint64_t * keyPtr64 = (const uint64_t *) keyPtr - Offset; - uint64_t nextkeys = keyPtr64[Offset]; - for (; Offset != 0; ++Offset) { - uint64_t keys = nextkeys; - nextkeys = keyPtr64[Offset + 1]; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + s); - } - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 8 + s); - } - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 16 + s); - } - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 24))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 24 + s); - } - dataPtr += 32; - consumedInts += 32; - continue; - } - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + s); - } - - keys >>= 16; - Data = 
_decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 8 + s); - } - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 16 + s); - } - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 24 + s); - } - - consumedInts += 32; - } - { - uint64_t keys = nextkeys; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + s); - } - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 8 + s); - } - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= 
(uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 16 + s); - } - Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *) (dataPtr + 24))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 24 + s); - } - - dataPtr += 32; - consumedInts += 32; - - } else { - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + s); - } - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 8 + s); - } - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 16 + s); - } - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { - int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); - return (consumedInts + 24 + s); - } - - consumedInts += 32; - } - // extract the 
highest integer which was stored so far; - // it will be the "prev" value for the remaining data - prev = (uint32_t)_mm_extract_epi32(PrevHi, 3); - } - } - uint32_t keysleft = count & 31; - if (keysleft > 0) { - uint64_t consumedkeys = keybytes - (keybytes & 7); - uint32_t out[32]; - svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, - keysleft, prev); - if (key <= out[keysleft - 1]) { - int s = lower_bound(out, key, 0, keysleft); - assert(s >= 0 && s < (int)keysleft); - *presult = out[s]; - return (consumedInts + s); + uint64_t keybytes = count / 4; // number of key bytes + int consumedInts = 0; + + if (keybytes >= 8) { + xmm_t PrevHi = _mm_set1_epi32(prev); + xmm_t PrevLow; + xmm_t Data; + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys = keyPtr64[Offset]; + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + nextkeys = keyPtr64[Offset + 1]; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + s); } + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 8 + s); + } + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 16 + s); + } + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 24))); + 
_scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 24 + s); + } + dataPtr += 32; + consumedInts += 32; + continue; + } + + Data = _decode_avx(keys & 0x00FF, &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + s); + } + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 8 + s); + } + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 16 + s); + } + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 24 + s); + } + + consumedInts += 32; } + { + uint64_t keys = nextkeys; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + _scan_16bit_avx_d1(Data, &PrevHi, 
&PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + s); + } + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 8 + s); + } + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 16 + s); + } + Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *)(dataPtr + 24))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 24 + s); + } - // key was not found! 
- *presult = key + 1; - return (int) count; -} + dataPtr += 32; + consumedInts += 32; -int svb_find_scalar_d1_init(uint8_t *keyPtr, - uint8_t *dataPtr, uint64_t count, - uint32_t prev, uint32_t searchkey, uint32_t *presult) { - uint8_t shift = 0; - uint32_t key = *keyPtr++; + } else { - for (uint32_t c = 0; c < count; c++) { - if (shift == 8) { - shift = 0; - key = *keyPtr++; + Data = _decode_avx(keys & 0x00FF, &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + s); } - prev += _decode_data(&dataPtr, (key >> shift) & 0x3); - if (prev >= searchkey) { - *presult = prev; - return (c); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 8 + s); + } + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 16 + s); } - shift += 2; + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) { + int s = find_lower_bound(&PrevHi, &PrevLow, key, presult); + return (consumedInts + 24 + s); + } + + consumedInts += 32; + } + // 
extract the highest integer which was stored so far; + // it will be the "prev" value for the remaining data + prev = (uint32_t)_mm_extract_epi32(PrevHi, 3); + } + } + uint32_t keysleft = count & 31; + if (keysleft > 0) { + uint64_t consumedkeys = keybytes - (keybytes & 7); + uint32_t out[32]; + svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, keysleft, + prev); + if (key <= out[keysleft - 1]) { + int s = lower_bound(out, key, 0, keysleft); + assert(s >= 0 && s < (int)keysleft); + *presult = out[s]; + return (consumedInts + s); } + } - *presult = searchkey + 1; - return (int) count; + // key was not found! + *presult = key + 1; + return (int)count; } +int svb_find_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count, + uint32_t prev, uint32_t searchkey, + uint32_t *presult) { + uint8_t shift = 0; + uint32_t key = *keyPtr++; + + for (uint32_t c = 0; c < count; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; + } + prev += _decode_data(&dataPtr, (key >> shift) & 0x3); + if (prev >= searchkey) { + *presult = prev; + return (c); + } + + shift += 2; + } + + *presult = searchkey + 1; + return (int)count; +} uint32_t svb_select_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, - uint64_t count, uint32_t prev, int slot) { - uint8_t shift = 0; - uint32_t key = *keyPtr++; + uint64_t count, uint32_t prev, int slot) { + uint8_t shift = 0; + uint32_t key = *keyPtr++; - (void)count; + (void)count; - // make sure that the loop is run at least once - for (int c = 0; c <= slot; c++) { - if (shift == 8) { - shift = 0; - key = *keyPtr++; - } - prev += _decode_data(&dataPtr, (key >> shift) & 0x3); - shift += 2; + // make sure that the loop is run at least once + for (int c = 0; c <= slot; c++) { + if (shift == 8) { + shift = 0; + key = *keyPtr++; } + prev += _decode_data(&dataPtr, (key >> shift) & 0x3); + shift += 2; + } - return prev; + return prev; } -uint32_t svb_select_avx_d1_init(uint8_t *keyPtr, - uint8_t *dataPtr, uint64_t count, - uint32_t 
prev, int slot) { - uint64_t keybytes = count / 4; // number of key bytes - int consumedInts = 0; - - // make sure that the loop is run at least once - if (keybytes >= 8) { - xmm_t PrevHi = _mm_set1_epi32(prev); - xmm_t PrevLow; - xmm_t Data; - - int64_t Offset = -(int64_t) keybytes / 8 + 1; - - const uint64_t * keyPtr64 = (const uint64_t *) keyPtr - Offset; - uint64_t nextkeys = keyPtr64[Offset]; - for (; Offset != 0; ++Offset) { - uint64_t keys = nextkeys; - nextkeys = keyPtr64[Offset + 1]; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 8) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 16) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 8)); - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 24) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 16)); - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 24))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 32) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 24)); - - dataPtr += 32; - consumedInts += 32; - continue; - } - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 8) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = 
_scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 16) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 8)); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 24) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 16)); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 32) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 24)); - - consumedInts += 32; - } - - { - uint64_t keys = nextkeys; - // faster 16-bit delta since we only have 8-bit values - if (!keys) { // 32 1-byte ints in a row - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 8) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 8))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 16) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 8)); - - Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *) (dataPtr + 16))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 24) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 16)); - - Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *) (dataPtr + 24))); - _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); - // check 8 ints - if (slot < consumedInts + 32) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - 
(consumedInts + 24)); - - dataPtr += 32; - consumedInts += 32; - - } else { - - Data = _decode_avx(keys & 0x00FF, &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 8) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 16) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 8)); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 24) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 16)); - - keys >>= 16; - Data = _decode_avx((keys & 0x00FF), &dataPtr); - PrevLow = _scan_avx_d1(Data, PrevHi); - Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); - PrevHi = _scan_avx_d1(Data, PrevLow); - // check 8 ints - if (slot < consumedInts + 32) - return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 24)); - - consumedInts += 32; - } - // extract the highest integer which was stored so far; - // it will be the "prev" value for the remaining data - prev = (uint32_t)_mm_extract_epi32(PrevHi, 3); - } - } - uint64_t consumedkeys = keybytes - (keybytes & 7); - uint32_t keysleft = count & 31; - return svb_select_scalar_d1_init(keyPtr + consumedkeys, dataPtr, - keysleft, prev, slot - consumedInts); -} +uint32_t svb_select_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, + uint64_t count, uint32_t prev, int slot) { + uint64_t keybytes = count / 4; // number of key bytes + int consumedInts = 0; + + // make sure that the loop is run at least once + if 
(keybytes >= 8) { + xmm_t PrevHi = _mm_set1_epi32(prev); + xmm_t PrevLow; + xmm_t Data; + + int64_t Offset = -(int64_t)keybytes / 8 + 1; + + const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset; + uint64_t nextkeys = keyPtr64[Offset]; + for (; Offset != 0; ++Offset) { + uint64_t keys = nextkeys; + nextkeys = keyPtr64[Offset + 1]; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 8) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 16) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 8)); + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 24) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 16)); + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 24))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 32) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 24)); + + dataPtr += 32; + consumedInts += 32; + continue; + } + + Data = _decode_avx(keys & 0x00FF, &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 8) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < 
consumedInts + 16) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 8)); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 24) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 16)); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 32) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - (consumedInts + 24)); + + consumedInts += 32; + } + { + uint64_t keys = nextkeys; + // faster 16-bit delta since we only have 8-bit values + if (!keys) { // 32 1-byte ints in a row + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 8) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 16) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 8)); + + Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 24) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 16)); + + Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *)(dataPtr + 24))); + _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow); + // check 8 ints + if (slot < consumedInts + 32) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 24)); + + dataPtr += 32; + consumedInts += 32; + + } else { + + Data = _decode_avx(keys & 0x00FF, &dataPtr); + PrevLow = 
_scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 8) + return _extract_from_xmm(&PrevHi, &PrevLow, slot - consumedInts); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 16) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 8)); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 24) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 16)); + + keys >>= 16; + Data = _decode_avx((keys & 0x00FF), &dataPtr); + PrevLow = _scan_avx_d1(Data, PrevHi); + Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr); + PrevHi = _scan_avx_d1(Data, PrevLow); + // check 8 ints + if (slot < consumedInts + 32) + return _extract_from_xmm(&PrevHi, &PrevLow, + slot - (consumedInts + 24)); + + consumedInts += 32; + } + // extract the highest integer which was stored so far; + // it will be the "prev" value for the remaining data + prev = (uint32_t)_mm_extract_epi32(PrevHi, 3); + } + } + uint64_t consumedkeys = keybytes - (keybytes & 7); + uint32_t keysleft = count & 31; + return svb_select_scalar_d1_init(keyPtr + consumedkeys, dataPtr, keysleft, + prev, slot - consumedInts); +} diff --git a/src/testcodecs.cpp b/src/testcodecs.cpp index f4b35e8..40299d3 100644 --- a/src/testcodecs.cpp +++ b/src/testcodecs.cpp @@ -12,207 +12,209 @@ using namespace SIMDCompressionLib; struct dataarray { - dataarray() : - name(), data() { - } - string name; - vector> data; + dataarray() : name(), data() {} + string name; + vector> data; }; - class EntropyRecorder { public: - 
EntropyRecorder() : - counter(), totallength(0) { - } - - void clear() { - counter.clear(); - totallength = 0; + EntropyRecorder() : counter(), totallength(0) {} + + void clear() { + counter.clear(); + totallength = 0; + } + void eat(const uint32_t *in, const size_t length) { + if (length == 0) + return; + totallength += length; + for (uint32_t k = 0; k < length; ++k, ++in) { + maptype::iterator i = counter.find(*in); + if (i != counter.end()) + i->second += 1; + else + counter[*in] = 1; } - void eat(const uint32_t *in, const size_t length) { - if (length == 0) - return; - totallength += length; - for (uint32_t k = 0; k < length; ++k, ++in) { - maptype::iterator i = counter.find(*in); - if (i != counter.end()) - i->second += 1; - else - counter[*in] = 1; - } + } + + double computeShannon() { + double total = 0; + for (maptype::iterator i = counter.begin(); i != counter.end(); ++i) { + const double x = static_cast(i->second); + total += x / static_cast(totallength) * + log(static_cast(totallength) / x) / log(2.0); } - - double computeShannon() { - double total = 0; - for (maptype::iterator i = counter.begin(); i - != counter.end(); ++i) { - const double x = static_cast(i->second); - total += x / static_cast(totallength) * log(static_cast(totallength) / x) / log(2.0); - } - return total; - } - - __attribute__((pure)) - double computeDataBits() { - double total = 0; - for (maptype::const_iterator i = counter.begin(); i - != counter.end(); ++i) { - total += static_cast(i->second) / static_cast(totallength) * static_cast(gccbits(i->first)); - } - return total; + return total; + } + + __attribute__((pure)) double computeDataBits() { + double total = 0; + for (maptype::const_iterator i = counter.begin(); i != counter.end(); ++i) { + total += static_cast(i->second) / + static_cast(totallength) * + static_cast(gccbits(i->first)); } - typedef unordered_map maptype; - maptype counter; - size_t totallength; + return total; + } + typedef unordered_map maptype; + maptype 
counter; + size_t totallength; }; - - -void sillybenchmark(vector datas, - vector &compressedbuffer, vector &recoverybuffer, - IntegerCODEC &codec) { - cout << "#benchmarking " << CODECFactory::getName(codec) << endl;//codec.name() - WallClockTimer z; - double packtime, unpacktime; - cout << "#name , bits/int , coding speed (mis) , decoding speed (mis)" - << endl; - for (vector::const_iterator it = datas.begin(); it - != datas.end(); ++it) { - const vector> &data = it->data; - vector membuffer; - packtime = 0; - unpacktime = 0; - double compsize = 0; - double intcounter = 0; - // dry run - for (const vector &D : data) { - vector dirtycopy(D); - size_t nvalue = compressedbuffer.size(); +void sillybenchmark(vector datas, vector &compressedbuffer, + vector &recoverybuffer, IntegerCODEC &codec) { + cout << "#benchmarking " << CODECFactory::getName(codec) + << endl; // codec.name() + WallClockTimer z; + double packtime, unpacktime; + cout << "#name , bits/int , coding speed (mis) , decoding speed (mis)" + << endl; + for (vector::const_iterator it = datas.begin(); it != datas.end(); + ++it) { + const vector> &data = it->data; + vector membuffer; + packtime = 0; + unpacktime = 0; + double compsize = 0; + double intcounter = 0; + // dry run + for (const vector &D : data) { + vector dirtycopy(D); + size_t nvalue = compressedbuffer.size(); #ifdef USE_ALIGNED - codec.encodeArray(dirtycopy.data(), dirtycopy.size(), - compressedbuffer.data(), nvalue); + codec.encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); #else - codec.encodeArray(dirtycopy.data()+1, dirtycopy.size()-1, // use unaligned address - compressedbuffer.data(), nvalue); + codec.encodeArray(dirtycopy.data() + 1, + dirtycopy.size() - 1, // use unaligned address + compressedbuffer.data(), nvalue); #endif - size_t recoveredvalues = recoverybuffer.size(); - codec.decodeArray(compressedbuffer.data(), nvalue, - recoverybuffer.data(), recoveredvalues); + size_t recoveredvalues = 
recoverybuffer.size(); + codec.decodeArray(compressedbuffer.data(), nvalue, recoverybuffer.data(), + recoveredvalues); #ifdef USE_ALIGNED - if (recoveredvalues != dirtycopy.size()) throw runtime_error("bug"); + if (recoveredvalues != dirtycopy.size()) + throw runtime_error("bug"); #else - if (recoveredvalues != dirtycopy.size()-1) throw runtime_error("bug"); + if (recoveredvalues != dirtycopy.size() - 1) + throw runtime_error("bug"); #endif - } - // actual run - for (const vector &D : data) { - vector dirtycopy(D); - intcounter += static_cast(dirtycopy.size()); - size_t nvalue = compressedbuffer.size(); - z.reset(); + } + // actual run + for (const vector &D : data) { + vector dirtycopy(D); + intcounter += static_cast(dirtycopy.size()); + size_t nvalue = compressedbuffer.size(); + z.reset(); #ifdef USE_ALIGNED - codec.encodeArray(dirtycopy.data(), dirtycopy.size(), - compressedbuffer.data(), nvalue); + codec.encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); #else - codec.encodeArray(dirtycopy.data()+1, dirtycopy.size()-1, // use unaligned address - compressedbuffer.data(), nvalue); + codec.encodeArray(dirtycopy.data() + 1, + dirtycopy.size() - 1, // use unaligned address + compressedbuffer.data(), nvalue); #endif - packtime += static_cast(z.split()); - compsize += static_cast(nvalue); - size_t recoveredvalues = recoverybuffer.size(); - double bestunpacktime = std::numeric_limits::infinity(); - for (int t = 0; t < 5; ++t) { - z.reset(); - codec.decodeArray(compressedbuffer.data(), nvalue, - recoverybuffer.data(), recoveredvalues); - double tup = static_cast(z.split()); - if (tup < bestunpacktime) bestunpacktime = tup; - } - unpacktime += bestunpacktime; + packtime += static_cast(z.split()); + compsize += static_cast(nvalue); + size_t recoveredvalues = recoverybuffer.size(); + double bestunpacktime = std::numeric_limits::infinity(); + for (int t = 0; t < 5; ++t) { + z.reset(); + codec.decodeArray(compressedbuffer.data(), nvalue, + 
recoverybuffer.data(), recoveredvalues); + double tup = static_cast(z.split()); + if (tup < bestunpacktime) + bestunpacktime = tup; + } + unpacktime += bestunpacktime; #ifdef USE_ALIGNED - if (recoveredvalues != dirtycopy.size()) throw runtime_error("bug"); + if (recoveredvalues != dirtycopy.size()) + throw runtime_error("bug"); #else - if (recoveredvalues != dirtycopy.size()-1) throw runtime_error("bug"); + if (recoveredvalues != dirtycopy.size() - 1) + throw runtime_error("bug"); #endif - } - cout << std::setprecision(4) << it->name << "\t" - << (static_cast(compsize) * 32.0 / intcounter - ) << "\t" - << intcounter - / static_cast(packtime) << "\t" - << intcounter - / static_cast(unpacktime) << endl; } - cout << endl; + cout << std::setprecision(4) << it->name << "\t" + << (static_cast(compsize) * 32.0 / intcounter) << "\t" + << intcounter / static_cast(packtime) << "\t" + << intcounter / static_cast(unpacktime) << endl; + } + cout << endl; } - - void benchmark(const uint32_t S, vector> &allcodecs) { - const uint32_t N = 1U << S; - cout << "##########################################################################"< datas; - cout << "# generating data..."; - cout << endl; - int Times = static_cast( - round( - 40.0 * static_cast(1U << 16) / static_cast(N) - ) - ); - if (Times == 0) Times = 1; - cout << "# Generating " << Times << " array for each gap, volume is " << Times *N << " ints" << endl; - for (uint32_t gap = 1; gap + S <= 31; gap += 1) { - dataarray X; - EntropyRecorder er; - for (int k = 0; k < Times; ++k) { - X.data.push_back(cdg.generateClustered(N, 1U << (gap + S))); - vector copy(X.data.back()); - delta(0U, copy.data(), copy.size()); - er.eat(copy.data(), copy.size()); - } - cout << "# entropy of " << gap << " is " << er.computeShannon() << endl; - ostringstream convert; - convert << gap; - X.name = convert.str(); - datas.push_back(X); + const uint32_t N = 1U << S; + cout << "####################################################################" + 
"######" + << endl; + cout << "# Warning: this microbench is sensitive to background processes." + << endl; + cout << "# You may need to repeat the test over several minutes or " + "hours" + << endl; + cout << "# if you cannot get exclusive access to the CPU and its " + "cache." + << endl; + cout << "####################################################################" + "######" + << endl; + cout << "# using arrays of size " << N << endl; + ClusteredDataGenerator cdg(0); // used fixed seed to lower variance + vector datas; + cout << "# generating data..."; + cout << endl; + int Times = static_cast( + round(40.0 * static_cast(1U << 16) / static_cast(N))); + if (Times == 0) + Times = 1; + cout << "# Generating " << Times << " array for each gap, volume is " + << Times * N << " ints" << endl; + for (uint32_t gap = 1; gap + S <= 31; gap += 1) { + dataarray X; + EntropyRecorder er; + for (int k = 0; k < Times; ++k) { + X.data.push_back(cdg.generateClustered(N, 1U << (gap + S))); + vector copy(X.data.back()); + delta(0U, copy.data(), copy.size()); + er.eat(copy.data(), copy.size()); } - vector compressedbuffer; - compressedbuffer.resize(N * 2); - vector recoverybuffer; - recoverybuffer.resize(N); - for (auto i : allcodecs) - sillybenchmark(datas, compressedbuffer, recoverybuffer, *i); + cout << "# entropy of " << gap << " is " << er.computeShannon() << endl; + ostringstream convert; + convert << gap; + X.name = convert.str(); + datas.push_back(X); + } + vector compressedbuffer; + compressedbuffer.resize(N * 2); + vector recoverybuffer; + recoverybuffer.resize(N); + for (auto i : allcodecs) + sillybenchmark(datas, compressedbuffer, recoverybuffer, *i); } void displayUsage() { - cout << "run as testcodecs nameofcodec1 nameofcodec2 ..." << endl; - cout << "where codecs are:" << endl; - vector all = CODECFactory::allNames(); - for (auto i = all.begin(); i != all.end(); ++i) { - cout << *i << endl; - } + cout << "run as testcodecs nameofcodec1 nameofcodec2 ..." 
<< endl; + cout << "where codecs are:" << endl; + vector all = CODECFactory::allNames(); + for (auto i = all.begin(); i != all.end(); ++i) { + cout << *i << endl; + } } int main(int argc, char **argv) { - if (argc <= 1) { - displayUsage(); - return -1; - } - vector> allcodecs; - for (int k = 1; k < argc; ++k) { - shared_ptr p = CODECFactory::getFromName(argv[k]); - if (p.get() == NULL) - return -2; - allcodecs.push_back(p); - } - benchmark(16, allcodecs); - return 0; - + if (argc <= 1) { + displayUsage(); + return -1; + } + vector> allcodecs; + for (int k = 1; k < argc; ++k) { + shared_ptr p = CODECFactory::getFromName(argv[k]); + if (p.get() == NULL) + return -2; + allcodecs.push_back(p); + } + benchmark(16, allcodecs); + return 0; } diff --git a/src/testintegration.cpp b/src/testintegration.cpp index 1697623..d655bc2 100644 --- a/src/testintegration.cpp +++ b/src/testintegration.cpp @@ -15,160 +15,157 @@ using namespace std; using namespace SIMDCompressionLib; - vector maskedcopy(const vector &in, const uint32_t bit) { - vector out(in); - if (bit == 32) - return out; - for (auto i = out.begin(); i != out.end(); ++i) { - *i = *i % (1U << bit); - } + vector out(in); + if (bit == 32) return out; + for (auto i = out.begin(); i != out.end(); ++i) { + *i = *i % (1U << bit); + } + return out; } -template +template bool equalOnFirstBits(const container32bit &data, const container32bit &recovered, uint32_t bit) { - if (bit == 32) { - return data == recovered; - } - for (uint32_t k = 0; k < data.size(); ++k) { - if (data[k] % (1U << bit) != recovered[k] % (1U << bit)) { - cout << " They differ at k = " << k << " data[k]= " << data[k] - << " recovered[k]=" << recovered[k] << endl; - return false; - } + if (bit == 32) { + return data == recovered; + } + for (uint32_t k = 0; k < data.size(); ++k) { + if (data[k] % (1U << bit) != recovered[k] % (1U << bit)) { + cout << " They differ at k = " << k << " data[k]= " << data[k] + << " recovered[k]=" << recovered[k] << endl; + 
return false; } - return true; + } + return true; } uint32_t mask(uint32_t bit) { - if (bit == 32) return 0xFFFFFFFFU; - return (1U << bit) - 1; + if (bit == 32) + return 0xFFFFFFFFU; + return (1U << bit) - 1; } template void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { - T = T + 1; // we have a warming up pass - uint32_t bogus = 0; - vector data(N); - vector compressed(N); - vector icompressed(N); - vector recovered(N); - WallClockTimer z; - double unpacktime; - double iunpacktime; - - cout << "#million of integers per second: higher is better" << endl; - cout << "#bit, unpack,iunpack" << endl; - - - for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { - uint32_t bit = bitindex + 1; - vector initdata(N); - for (size_t i = 0 ; 4 * i < data.size() ; i += 4) { - initdata[i] = random(bit) + (i >= 4 ? initdata[i - 4] : 0); - for (size_t j = 1; j < 4; ++j) { - initdata[i + j] = initdata[i]; - } - } - - const vector refdata = initdata; - vector().swap(initdata); - - icompressed.clear(); - // 4 * N should be enough for all schemes - icompressed.resize(4 * N, 0); - compressed.clear(); - // 4 * N should be enough for all schemes - compressed.resize(4 * N, 0); - recovered.clear(); - recovered.resize(N, 0); - - if (needPaddingTo128Bits(recovered.data())) { - throw logic_error("Array is not aligned on 128 bit boundary!"); - } - if (needPaddingTo128Bits(icompressed.data())) { - throw logic_error("Array is not aligned on 128 bit boundary!"); - } - if (needPaddingTo128Bits(compressed.data())) { - throw logic_error("Array is not aligned on 128 bit boundary!"); - } - if (needPaddingTo128Bits(refdata.data())) { - throw logic_error("Array is not aligned on 128 bit boundary!"); - } + T = T + 1; // we have a warming up pass + uint32_t bogus = 0; + vector data(N); + vector compressed(N); + vector icompressed(N); + vector recovered(N); + WallClockTimer z; + double unpacktime; + double iunpacktime; + + cout << "#million of integers per second: higher is better" << 
endl; + cout << "#bit, unpack,iunpack" << endl; + + for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { + uint32_t bit = bitindex + 1; + vector initdata(N); + for (size_t i = 0; 4 * i < data.size(); i += 4) { + initdata[i] = random(bit) + (i >= 4 ? initdata[i - 4] : 0); + for (size_t j = 1; j < 4; ++j) { + initdata[i + j] = initdata[i]; + } + } + const vector refdata = initdata; + vector().swap(initdata); - for (uint32_t repeat = 0; repeat < 1; ++repeat) { + icompressed.clear(); + // 4 * N should be enough for all schemes + icompressed.resize(4 * N, 0); + compressed.clear(); + // 4 * N should be enough for all schemes + compressed.resize(4 * N, 0); + recovered.clear(); + recovered.resize(N, 0); - unpacktime = 0; + if (needPaddingTo128Bits(recovered.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(icompressed.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(compressed.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(refdata.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } - iunpacktime = 0; + for (uint32_t repeat = 0; repeat < 1; ++repeat) { - for (uint32_t t = 0; t <= T; ++t) { + unpacktime = 0; - assert(data.size() == refdata.size()); - fill(icompressed.begin(), icompressed.end(), 0); - fill(recovered.begin(), recovered.end(), 0); - memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t));//memcpy can be slow - Helper::pack(data.data(), data.size(), icompressed.data(), bit); - z.reset(); - Helper::unpack(icompressed.data(), refdata.size(), recovered.data(), bit); - if (t > 0)// we don't count the first run - unpacktime += static_cast(z.split()); - if (!equalOnFirstBits(refdata, recovered, bit)) { - cout << " Bug 1a " << bit << endl; - return; - } - memcpy(data.data(), refdata.data(), data.size() * sizeof(uint32_t));//memcpy can be slow - 
Helper::pack(data.data(), data.size(), icompressed.data(), bit); + iunpacktime = 0; - z.reset(); - Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(), bit); - if (t > 0)// we don't count the first run - iunpacktime += static_cast(z.split()); - if (!equalOnFirstBits(refdata, recovered, bit)) { - cout << " Bug 2 " << bit << endl; - return; - } - } + for (uint32_t t = 0; t <= T; ++t) { - cout << std::setprecision(4) << bit << "\t\t"; - cout << "\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; + assert(data.size() == refdata.size()); + fill(icompressed.begin(), icompressed.end(), 0); + fill(recovered.begin(), recovered.end(), 0); + memcpy(data.data(), refdata.data(), + data.size() * sizeof(uint32_t)); // memcpy can be slow + Helper::pack(data.data(), data.size(), icompressed.data(), bit); + z.reset(); + Helper::unpack(icompressed.data(), refdata.size(), recovered.data(), + bit); + if (t > 0) // we don't count the first run + unpacktime += static_cast(z.split()); + if (!equalOnFirstBits(refdata, recovered, bit)) { + cout << " Bug 1a " << bit << endl; + return; + } + memcpy(data.data(), refdata.data(), + data.size() * sizeof(uint32_t)); // memcpy can be slow + Helper::pack(data.data(), data.size(), icompressed.data(), bit); + + z.reset(); + Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(), + bit); + if (t > 0) // we don't count the first run + iunpacktime += static_cast(z.split()); + if (!equalOnFirstBits(refdata, recovered, bit)) { + cout << " Bug 2 " << bit << endl; + return; + } + } - cout << "\t\t" << N * (T - 1) / (iunpacktime); + cout << std::setprecision(4) << bit << "\t\t"; + cout << "\t\t" << N * (T - 1) / (unpacktime) << "\t\t"; - cout << endl; - } + cout << "\t\t" << N * (T - 1) / (iunpacktime); + cout << endl; } - cout << "# ignore this " << bogus << endl; - + } + cout << "# ignore this " << bogus << endl; } - - int main() { - cout << "# SIMD bit-packing (regular) cache-to-cache 2^12" << endl; - simplebenchmark>(1U << 12, 
1U << 14); - cout << endl; - cout << "# SIMD bit-packing (coarse delta 2) cache-to-cache 2^12" << endl; - simplebenchmark>(1U << 12, 1U << 14); - cout << endl; - - cout << "# SIMD bit-packing (coarse max 4) cache-to-cache 2^12" << endl; - simplebenchmark>(1U << 12, 1U << 14); - cout << endl; + cout << "# SIMD bit-packing (regular) cache-to-cache 2^12" << endl; + simplebenchmark>(1U << 12, 1U << 14); + cout << endl; + cout << "# SIMD bit-packing (coarse delta 2) cache-to-cache 2^12" << endl; + simplebenchmark>(1U << 12, 1U << 14); + cout << endl; + cout << "# SIMD bit-packing (coarse max 4) cache-to-cache 2^12" << endl; + simplebenchmark>(1U << 12, 1U << 14); + cout << endl; - cout << "# SIMD bit-packing (coarse delta 4) cache-to-cache 2^12" << endl; - simplebenchmark>(1U << 12, 1U << 14); - cout << endl; + cout << "# SIMD bit-packing (coarse delta 4) cache-to-cache 2^12" << endl; + simplebenchmark>(1U << 12, 1U << 14); + cout << endl; - cout << "# Scalar cache-to-cache 2^12" << endl; - simplebenchmark(1U << 12, 1U << 14); - cout << endl; + cout << "# Scalar cache-to-cache 2^12" << endl; + simplebenchmark(1U << 12, 1U << 14); + cout << endl; - return 0; + return 0; } diff --git a/src/unit.cpp b/src/unit.cpp index d6cd87b..63e7728 100644 --- a/src/unit.cpp +++ b/src/unit.cpp @@ -4,7 +4,6 @@ * */ - #include "common.h" #include "bitpackinghelpers.h" #include "simdbitpackinghelpers.h" @@ -22,701 +21,690 @@ using namespace std; using namespace SIMDCompressionLib; struct dataarray { - dataarray() : name(), data() {} - string name; - vector data; + dataarray() : name(), data() {} + string name; + vector data; }; - void testSmall(vector> codecs) { - vector data; - data.push_back(1U); - data.push_back(3U); - data.push_back(5U); - data.push_back(15U + 1024U); - data.push_back(21U + 1024U); - - for (shared_ptr codec : codecs) { - vector dirtycopy(data); - vector compressedbuffer(data.size() + 1024); - vector recoverybuffer(data.size() + 1024); - size_t nvalue = 
compressedbuffer.size(); - codec->encodeArray(dirtycopy.data(), dirtycopy.size(), compressedbuffer.data(), nvalue); - size_t recoveredvalues = recoverybuffer.size(); - codec->decodeArray(compressedbuffer.data(), nvalue, recoverybuffer.data(), recoveredvalues); - recoverybuffer.resize(recoveredvalues); - if (data != recoverybuffer) { - cout << "Problem with " << codec->name() << endl; - for (size_t i = 0; i < data.size(); ++i) - cout << i << " " << data[i] << " " << recoverybuffer[i] << endl; - throw std::logic_error("bug"); - } + vector data; + data.push_back(1U); + data.push_back(3U); + data.push_back(5U); + data.push_back(15U + 1024U); + data.push_back(21U + 1024U); + + for (shared_ptr codec : codecs) { + vector dirtycopy(data); + vector compressedbuffer(data.size() + 1024); + vector recoverybuffer(data.size() + 1024); + size_t nvalue = compressedbuffer.size(); + codec->encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); + size_t recoveredvalues = recoverybuffer.size(); + codec->decodeArray(compressedbuffer.data(), nvalue, recoverybuffer.data(), + recoveredvalues); + recoverybuffer.resize(recoveredvalues); + if (data != recoverybuffer) { + cout << "Problem with " << codec->name() << endl; + for (size_t i = 0; i < data.size(); ++i) + cout << i << " " << data[i] << " " << recoverybuffer[i] << endl; + throw std::logic_error("bug"); } + } } - void sillyunittest(vector datas, vector &compressedbuffer, vector &recoverybuffer, IntegerCODEC &codec) { - for (vector::const_iterator i = datas.begin() ; - i != datas.end() ; ++i) { - const vector &data = i->data; - vector dirtycopy(data); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(dirtycopy.data(), dirtycopy.size(), compressedbuffer.data(), nvalue); - size_t recoveredvalues = recoverybuffer.size(); - codec.decodeArray(compressedbuffer.data(), nvalue, recoverybuffer.data(), recoveredvalues); - recoverybuffer.resize(recoveredvalues); - if (data != recoverybuffer) { - cout << 
"Problem with " << codec.name() << endl; - size_t howmany = 0; - for (size_t i = 0; i < data.size(); ++i) { - if (data[i] != recoverybuffer[i]) { - if (++howmany > 5) { - cout << "..." << endl; - break; - } - cout << i << " " << data[i] << " " << recoverybuffer[i] << endl; - } - } - throw std::logic_error("bug"); + for (vector::const_iterator i = datas.begin(); i != datas.end(); + ++i) { + const vector &data = i->data; + vector dirtycopy(data); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); + size_t recoveredvalues = recoverybuffer.size(); + codec.decodeArray(compressedbuffer.data(), nvalue, recoverybuffer.data(), + recoveredvalues); + recoverybuffer.resize(recoveredvalues); + if (data != recoverybuffer) { + cout << "Problem with " << codec.name() << endl; + size_t howmany = 0; + for (size_t i = 0; i < data.size(); ++i) { + if (data[i] != recoverybuffer[i]) { + if (++howmany > 5) { + cout << "..." << endl; + break; + } + cout << i << " " << data[i] << " " << recoverybuffer[i] << endl; } + } + throw std::logic_error("bug"); } - cout.flush(); + } + cout.flush(); } -void unittrivial(bool deltacode, vector> &allcodecs, const uint32_t S) { - const uint32_t N = 1U << S; - vector datas; - dataarray X; - vector d(N); - for (uint32_t k = 0; k < N; ++k) - d[k] = k; - X.data = d; - if (deltacode) - delta(0u, X.data.data(), X.data.size()); - - ostringstream convert; - convert << N; - X.name = convert.str(); - datas.push_back(X); - vector compressedbuffer; - compressedbuffer.resize(N * 2); - vector recoverybuffer; - recoverybuffer.resize(N); - for (auto i : allcodecs) - sillyunittest(datas, compressedbuffer, recoverybuffer, *i); +void unittrivial(bool deltacode, vector> &allcodecs, + const uint32_t S) { + const uint32_t N = 1U << S; + vector datas; + dataarray X; + vector d(N); + for (uint32_t k = 0; k < N; ++k) + d[k] = k; + X.data = d; + if (deltacode) + delta(0u, X.data.data(), 
X.data.size()); + + ostringstream convert; + convert << N; + X.name = convert.str(); + datas.push_back(X); + vector compressedbuffer; + compressedbuffer.resize(N * 2); + vector recoverybuffer; + recoverybuffer.resize(N); + for (auto i : allcodecs) + sillyunittest(datas, compressedbuffer, recoverybuffer, *i); } +void unit(bool deltacode, vector> &allcodecs, + const uint32_t S, int seed) { + const uint32_t N = 1U << S; + ClusteredDataGenerator cdg(seed); -void unit(bool deltacode, vector> &allcodecs, const uint32_t S , int seed) { - const uint32_t N = 1U << S; - ClusteredDataGenerator cdg(seed); - - vector datas; - uint32_t NUMBER = 1;// Increase as needed - for (uint32_t gap = 1; gap + S <= 31; gap += 1) { - for (uint32_t T = 0; T < NUMBER; ++T) { - dataarray X; - X.data = cdg.generateClustered(N, 1U << (gap + S)); + vector datas; + uint32_t NUMBER = 1; // Increase as needed + for (uint32_t gap = 1; gap + S <= 31; gap += 1) { + for (uint32_t T = 0; T < NUMBER; ++T) { + dataarray X; + X.data = cdg.generateClustered(N, 1U << (gap + S)); - if (deltacode) delta(0u, X.data.data(), X.data.size()); - - ostringstream convert; - convert << gap; - X.name = convert.str(); - datas.push_back(X); - } + if (deltacode) + delta(0u, X.data.data(), X.data.size()); + ostringstream convert; + convert << gap; + X.name = convert.str(); + datas.push_back(X); } - vector compressedbuffer; - compressedbuffer.resize(N * 2); - vector recoverybuffer; - recoverybuffer.resize(N); - for (auto i : allcodecs) - sillyunittest(datas, compressedbuffer, recoverybuffer, *i); + } + vector compressedbuffer; + compressedbuffer.resize(N * 2); + vector recoverybuffer; + recoverybuffer.resize(N); + for (auto i : allcodecs) + sillyunittest(datas, compressedbuffer, recoverybuffer, *i); } - - int test1(intersectionfunction f, bool testwriteback) { - const uint32_t firstpost[13] = {27181,35350,39241,39277,39278,44682,64706,120447,120450,159274,159290,173895,173942, - }; - const uint32_t secondpost[13] = 
{25369,28789,28790,28792,28794,28797,37750,42317,68797,68877,68881,68990,85488}; - vector < uint32_t > inter(13); - size_t s = f(firstpost, 13, secondpost, 13, inter.data()); - inter.resize(s); - vector < uint32_t > correct(13); - size_t cs = classicalintersection(firstpost, 13, secondpost, 13, - correct.data()); - correct.resize(cs); - if (inter != correct) { - cout << inter.size() << " " << correct.size() << endl; - for (size_t i = 0; (i < inter.size()) && (i < correct.size()); ++i) - cout << i << " " << inter[i] << " " << correct[i] << endl; - return 1; - } - if (!testwriteback) - return 0; - vector < uint32_t > inter2(firstpost, firstpost + 13); - size_t s2 = f(inter2.data(), 13, secondpost, 13, inter2.data()); - inter2.resize(s2); - if (inter2 != correct) - return 2; + const uint32_t firstpost[13] = { + 27181, 35350, 39241, 39277, 39278, 44682, 64706, + 120447, 120450, 159274, 159290, 173895, 173942, + }; + const uint32_t secondpost[13] = {25369, 28789, 28790, 28792, 28794, + 28797, 37750, 42317, 68797, 68877, + 68881, 68990, 85488}; + vector inter(13); + size_t s = f(firstpost, 13, secondpost, 13, inter.data()); + inter.resize(s); + vector correct(13); + size_t cs = + classicalintersection(firstpost, 13, secondpost, 13, correct.data()); + correct.resize(cs); + if (inter != correct) { + cout << inter.size() << " " << correct.size() << endl; + for (size_t i = 0; (i < inter.size()) && (i < correct.size()); ++i) + cout << i << " " << inter[i] << " " << correct[i] << endl; + return 1; + } + if (!testwriteback) return 0; - + vector inter2(firstpost, firstpost + 13); + size_t s2 = f(inter2.data(), 13, secondpost, 13, inter2.data()); + inter2.resize(s2); + if (inter2 != correct) + return 2; + return 0; } int test2(intersectionfunction f) { - const uint32_t firstpost[5] = { 12635, 12921, 12923, 12924, - 12926 - }; - - const uint32_t secondpost[173] = { 3756, 11996, 12044, 12049, 12109, 12128, - 12131, 12141, 12142, 12150, 12154, 12160, 12167, 12168, 12172, - 12177, 
12201, 12208, 12215, 12216, 12223, 12228, 12232, 12233, - 12234, 12235, 12236, 12240, 12241, 12242, 12243, 12254, 12255, - 12256, 12257, 12259, 12260, 12261, 12262, 12264, 12265, 12266, - 12275, 12295, 12471, 12482, 12486, 12508, 12509, 12510, 12511, - 12512, 12530, 12536, 12572, 12573, 12589, 12607, 12609, 12611, - 12630, 12631, 12632, 12633, 12634, 12635, 12636, 12653, 12655, - 12657, 12668, 12672, 12685, 12702, 12716, 12721, 12741, 12745, - 12750, 12755, 12757, 12761, 12765, 12767, 12768, 12794, 12802, - 12803, 12823, 12842, 12851, 12871, 12891, 12893, 12894, 12895, - 12896, 12897, 12915, 12917, 12918, 12919, 12920, 12921, 12922, - 12923, 12924, 12925, 12927, 12929, 12932, 12933, 12934, 12935, - 12936, 12937, 12938, 12939, 12942, 12946, 12951, 12955, 12963, - 12972, 13011, 13013, 13014, 13015, 13017, 13032, 13033, 13036, - 13042, 13050, 13051, 13052, 13057, 13058, 13060, 13090, 13120, - 13132, 13136, 13147, 13185, 13191, 13192, 13193, 13194, 13195, - 13198, 13202, 13205, 13219, 13228, 13230, 13232, 13233, 13238, - 13240, 13246, 13248, 13277, 13278, 13281, 13282, 13283, 13284, - 13291, 13320, 13338, 13346, 13347 - }; - vector < uint32_t > inter(173); - size_t s = f(firstpost, 5, secondpost, 173, inter.data()); - inter.resize(s); - vector < uint32_t > correct(173); - size_t cs = classicalintersection(firstpost, 5, secondpost, 173, - correct.data()); - correct.resize(cs); - if (inter != correct) { - cout << inter.size() << " " << correct.size() << endl; - cout<<" correct answer:"< inter(173); + size_t s = f(firstpost, 5, secondpost, 173, inter.data()); + inter.resize(s); + vector correct(173); + size_t cs = + classicalintersection(firstpost, 5, secondpost, 173, correct.data()); + correct.resize(cs); + if (inter != correct) { + cout << inter.size() << " " << correct.size() << endl; + cout << " correct answer:" << endl; + for (size_t i = 0; i < correct.size(); ++i) + cout << i << " " << correct[i] << endl; + cout << " bad answer:" << endl; + for (size_t i = 0; i < 
inter.size(); ++i) + cout << i << " " << inter[i] << endl; + return 1; + } + return 0; } - int test3(intersectionfunction f) { - vector firstpost; - vector secondpost; - vector trueinter; + vector firstpost; + vector secondpost; + vector trueinter; - for(uint32_t i = 10; i < 31; ++i) { - firstpost.push_back((1U< inter(firstpost.size()); - size_t s = f(firstpost.data(), firstpost.size(), secondpost.data(), secondpost.size(), inter.data()); - inter.resize(s); - if(inter != trueinter) { - cout << inter.size() << " " << trueinter.size() << endl; - for (size_t i = 0; (i < inter.size()) && (i < trueinter.size()); ++i) - cout << i << " " << inter[i] << " " << trueinter[i] << endl; - return 1; - - return 1; + for (uint32_t i = 10; i < 31; ++i) { + firstpost.push_back((1U << i) | 3U); + trueinter.push_back((1U << i) | 3U); + for (uint32_t j = 3; j < 1000; j += 11) { + secondpost.push_back((1U << i) | j); } - return 0; + firstpost.push_back((1U << i) | 1001U); + } + vector inter(firstpost.size()); + size_t s = f(firstpost.data(), firstpost.size(), secondpost.data(), + secondpost.size(), inter.data()); + inter.resize(s); + if (inter != trueinter) { + cout << inter.size() << " " << trueinter.size() << endl; + for (size_t i = 0; (i < inter.size()) && (i < trueinter.size()); ++i) + cout << i << " " << inter[i] << " " << trueinter[i] << endl; + return 1; + + return 1; + } + return 0; } - - void tellmeaboutmachine() { - cout << "number of bytes in ostream::pos_type = " - << sizeof(ostream::pos_type) << endl; - cout << "number of bytes in size_t = " << sizeof(size_t) << endl; - cout << "number of bytes in int = " << sizeof(int) << endl; - cout << "number of bytes in long = " << sizeof(long) << endl; -#if __LITTLE_ENDIAN__ - cout << "you have little endian machine" << endl; + cout << "number of bytes in ostream::pos_type = " << sizeof(ostream::pos_type) + << endl; + cout << "number of bytes in size_t = " << sizeof(size_t) << endl; + cout << "number of bytes in int = " << sizeof(int) 
<< endl; + cout << "number of bytes in long = " << sizeof(long) << endl; +#if __LITTLE_ENDIAN__ + cout << "you have little endian machine" << endl; #endif -#if __BIG_ENDIAN__ - cout << "you have a big endian machine" << endl; +#if __BIG_ENDIAN__ + cout << "you have a big endian machine" << endl; #endif #if __CHAR_BIT__ - if (__CHAR_BIT__ != 8) - cout << "on your machine, chars don't have 8bits???" << endl; + if (__CHAR_BIT__ != 8) + cout << "on your machine, chars don't have 8bits???" << endl; #endif #if __GNUG__ - cout << "GNU GCC compiler detected." << endl; + cout << "GNU GCC compiler detected." << endl; #else - cout << "Non-GCC compiler." << endl; + cout << "Non-GCC compiler." << endl; #endif } -template -void testFindSimple() { - T codec; - const int max = 256; - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = i; - - // encode in a buffer - vector compressedbuffer(max * sizeof(uint32_t) + 1024); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); - - uint32_t k = 0; - for (int i = 0; i < max; i++) { - size_t pos = codec.findLowerBound(compressedbuffer.data(), max, i, &k); - if (k != (uint32_t)i && pos != static_cast(i)) { - cout << codec.name() << "::findLowerBoundDelta failed with " - << i << endl; - throw std::logic_error("bug"); - } +template void testFindSimple() { + T codec; + const int max = 256; + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = i; + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); + + uint32_t k = 0; + for (int i = 0; i < max; i++) { + size_t pos = codec.findLowerBound(compressedbuffer.data(), max, i, &k); + if (k != (uint32_t)i && pos != static_cast(i)) { + cout << codec.name() << "::findLowerBoundDelta failed with " << i << endl; + throw std::logic_error("bug"); } + } - cout << codec.name() << 
"::findLowerBoundDelta ok" << endl; + cout << codec.name() << "::findLowerBoundDelta ok" << endl; } -template -void testFindAdvanced() { - T codec; - const int max = 2944; - uint32_t ints[max]; - // print random seed to make the test reproducable - time_t t = ::time(0); - cout << "Seed is " << t << endl; - ::srand(static_cast(t)); - - // initialize - for (int i = 0; i < max; i++) - ints[i] = 1 + i * 2; - - // encode in a buffer - vector compressedbuffer(max * sizeof(uint32_t) + 1024); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); - - uint32_t k1 = 0; - uint32_t k2 = 0; - for (uint32_t i = 0; i < max * 2; i++) { - size_t pos1 = codec.findLowerBound(compressedbuffer.data(), max, i, &k1); - uint32_t *it = std::lower_bound(&ints[0], &ints[max], i); - size_t pos2 = static_cast(it - &ints[0]); - k2 = *it; - // key not found? - if (it == &ints[max] && pos1 != max) { - cout << codec.name() << "::findLowerBoundDelta failed at line " - << __LINE__ << " (i = " << i << ")" << endl; - throw std::logic_error("bug"); - } - // otherwise make sure both results are equal - if (k1 != k2) { - cout << codec.name() << "::findLowerBoundDelta failed at line " - << __LINE__ << " (i = " << i << ")" << endl; - throw std::logic_error("bug"); - } - if (pos1 != pos2) { - cout << codec.name() << "::findLowerBoundDelta failed at line " - << __LINE__ << " (i = " << i << ")" << endl; - throw std::logic_error("bug"); - } +template void testFindAdvanced() { + T codec; + const int max = 2944; + uint32_t ints[max]; + // print random seed to make the test reproducable + time_t t = ::time(0); + cout << "Seed is " << t << endl; + ::srand(static_cast(t)); + + // initialize + for (int i = 0; i < max; i++) + ints[i] = 1 + i * 2; + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); + + uint32_t k1 = 0; + 
uint32_t k2 = 0; + for (uint32_t i = 0; i < max * 2; i++) { + size_t pos1 = codec.findLowerBound(compressedbuffer.data(), max, i, &k1); + uint32_t *it = std::lower_bound(&ints[0], &ints[max], i); + size_t pos2 = static_cast(it - &ints[0]); + k2 = *it; + // key not found? + if (it == &ints[max] && pos1 != max) { + cout << codec.name() << "::findLowerBoundDelta failed at line " + << __LINE__ << " (i = " << i << ")" << endl; + throw std::logic_error("bug"); + } + // otherwise make sure both results are equal + if (k1 != k2) { + cout << codec.name() << "::findLowerBoundDelta failed at line " + << __LINE__ << " (i = " << i << ")" << endl; + throw std::logic_error("bug"); } + if (pos1 != pos2) { + cout << codec.name() << "::findLowerBoundDelta failed at line " + << __LINE__ << " (i = " << i << ")" << endl; + throw std::logic_error("bug"); + } + } - cout << codec.name() << "::findLowerBoundDelta ok" << endl; + cout << codec.name() << "::findLowerBoundDelta ok" << endl; } +template void testInsert() { + T codec; + const int max = 256; + srand(0); + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = rand(); + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + vector decomp(max); + vector sofar; + + size_t currentsize = 0; + for (int i = 0; i < max; i++) { + currentsize = codec.insert(compressedbuffer.data(), + static_cast(currentsize), ints[i]); + size_t howmany = decomp.size(); + codec.decodeArray(compressedbuffer.data(), currentsize, decomp.data(), + howmany); + sofar.push_back(ints[i]); + std::sort(sofar.begin(), sofar.end()); + if (howmany != sofar.size()) + cout << howmany << " " << sofar.size() << " " << i << endl; + assert(howmany == sofar.size()); + for (size_t j = 0; j < howmany; ++j) { + if (decomp[j] != sofar[j]) { + cout << codec.name() << "::insert failed with i = " << i + << ", j = " << j << endl; + for (size_t j = 0; j < howmany; ++j) { + cout << j << " --> stored: " << decomp[j] << " correct: " << sofar[j] + << 
endl; + } + for (size_t j = 0; j < currentsize * 4; ++j) { + cout << (uint32_t)(((uint8_t *)compressedbuffer.data())[j]) << " "; + } + cout << endl; + throw std::logic_error("bug"); + } + } + } -template -void testInsert() { - T codec; - const int max = 256; - srand(0); - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = rand(); - // encode in a buffer - vector < uint32_t > compressedbuffer(max * sizeof(uint32_t) + 1024); - vector < uint32_t > decomp(max); - vector < uint32_t > sofar; - - size_t currentsize = 0; - for (int i = 0; i < max; i++) { - currentsize = codec.insert(compressedbuffer.data(), static_cast(currentsize), ints[i]); - size_t howmany = decomp.size(); - codec.decodeArray(compressedbuffer.data(),currentsize,decomp.data(),howmany); - sofar.push_back(ints[i]); - std::sort(sofar.begin(),sofar.end()); - if(howmany != sofar.size()) - cout << howmany << " " < stored: " << decomp[j] << " correct: " << sofar[j] << endl; - } - for(size_t j = 0; j < currentsize * 4; ++j) { - cout << (uint32_t)(((uint8_t *) compressedbuffer.data()) [j]) << " "; - } - cout << endl; - - throw std::logic_error("bug"); - } - } - } - - cout << codec.name() << "::insert ok" << endl; + cout << codec.name() << "::insert ok" << endl; } // same as testAppend<>, but uses encodeByteArray/decodeByteArray // to avoid padding -template -void testAppendByteArray() { - T codec; - const int max = 256; - srand(0); - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = rand(); - std::sort(&ints[0], &ints[max]); - - // encode in a buffer - vector < uint8_t > compressedbuffer(max * sizeof(uint32_t) + 1024); - vector < uint32_t > decomp(max); - vector < uint32_t > sofar; - - size_t currentsize = 0; - for (int i = 0; i < max; i++) { - currentsize = codec.appendToByteArray(compressedbuffer.data(), currentsize, - i > 0 ? 
ints[i - 1] : 0, ints[i]); - size_t howmany = decomp.size() * 4; - // we pad if needed - if (codec.name() == "VByteDelta") {// must pad with 1s - for (unsigned int k = 0; k < (currentsize + 3)/4*4 - currentsize; ++k) { - compressedbuffer[currentsize + k] = 0xFF; - } - } else {// regular padding with 0s - for (unsigned int k = 0; k < (currentsize + 3)/4*4 - currentsize; ++k) { - compressedbuffer[currentsize + k] = 0; - } - } - codec.decodeArray((const uint32_t *)compressedbuffer.data(), (currentsize + 3)/4,decomp.data(), howmany); - //codec.decodeFromByteArray(compressedbuffer.data(), currentsize, - // decomp.data(), howmany); - sofar.push_back(ints[i]); - if (howmany != sofar.size()) - cout << howmany << " " << sofar.size() << " " << i < stored: " << decomp[j] - << " correct: " << sofar[j] << endl; - } - cout << endl; - - throw std::logic_error("bug"); - } - } - } - - cout << codec.name() << "::append ok" << endl; -} +template void testAppendByteArray() { + T codec; + const int max = 256; + srand(0); + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = rand(); + std::sort(&ints[0], &ints[max]); + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + vector decomp(max); + vector sofar; + + size_t currentsize = 0; + for (int i = 0; i < max; i++) { + currentsize = codec.appendToByteArray(compressedbuffer.data(), currentsize, + i > 0 ? 
ints[i - 1] : 0, ints[i]); + size_t howmany = decomp.size() * 4; + // we pad if needed + if (codec.name() == "VByteDelta") { // must pad with 1s + for (unsigned int k = 0; k < (currentsize + 3) / 4 * 4 - currentsize; + ++k) { + compressedbuffer[currentsize + k] = 0xFF; + } + } else { // regular padding with 0s + for (unsigned int k = 0; k < (currentsize + 3) / 4 * 4 - currentsize; + ++k) { + compressedbuffer[currentsize + k] = 0; + } + } + codec.decodeArray((const uint32_t *)compressedbuffer.data(), + (currentsize + 3) / 4, decomp.data(), howmany); + // codec.decodeFromByteArray(compressedbuffer.data(), currentsize, + // decomp.data(), howmany); + sofar.push_back(ints[i]); + if (howmany != sofar.size()) + cout << howmany << " " << sofar.size() << " " << i << endl; + assert(howmany == sofar.size()); + + for (size_t j = 0; j < howmany; ++j) { + if (decomp[j] != sofar[j]) { + cout << codec.name() << "::append failed with i = " << i + << ", j = " << j << endl; + for (size_t j = 0; j < howmany; ++j) { + cout << j << " --> stored: " << decomp[j] << " correct: " << sofar[j] + << endl; + } + cout << endl; + throw std::logic_error("bug"); + } + } + } -template -void testAppend() { - T codec; - const int max = 256; - srand(0); - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = rand(); - std::sort(&ints[0], &ints[max]); - - // encode in a buffer - vector < uint32_t > compressedbuffer(max * sizeof(uint32_t) + 1024); - vector < uint32_t > decomp(max); - vector < uint32_t > sofar; - - size_t currentsize = 0; - for (int i = 0; i < max; i++) { - currentsize = codec.append((uint8_t *)compressedbuffer.data(), - currentsize, ints[i]); - size_t howmany = decomp.size() * 4; - codec.decodeArray(compressedbuffer.data(), currentsize, - decomp.data(), howmany); - sofar.push_back(ints[i]); - if (howmany != sofar.size()) - cout << howmany << " " << sofar.size() << " " << i < stored: " << decomp[j] - << " correct: " << sofar[j] << endl; - } - cout << endl; - - throw 
std::logic_error("bug"); - } - } - } - - cout << codec.name() << "::append ok" << endl; + cout << codec.name() << "::append ok" << endl; } -template -void testSelectSimple() { - T codec; - const int max = 256; - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = i; - // encode in a buffer - vector compressedbuffer(max * sizeof(uint32_t) + 1024); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); - - compressedbuffer.resize(nvalue); - compressedbuffer.shrink_to_fit(); - for (int i = 0; i < max; i++) { - - uint32_t k = codec.select(compressedbuffer.data(), i); - if (k != (uint32_t)i) { - cout << codec.name() << "::select failed with " << i << endl; - throw std::logic_error("bug"); +template void testAppend() { + T codec; + const int max = 256; + srand(0); + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = rand(); + std::sort(&ints[0], &ints[max]); + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + vector decomp(max); + vector sofar; + + size_t currentsize = 0; + for (int i = 0; i < max; i++) { + currentsize = + codec.append((uint8_t *)compressedbuffer.data(), currentsize, ints[i]); + size_t howmany = decomp.size() * 4; + codec.decodeArray(compressedbuffer.data(), currentsize, decomp.data(), + howmany); + sofar.push_back(ints[i]); + if (howmany != sofar.size()) + cout << howmany << " " << sofar.size() << " " << i << endl; + assert(howmany == sofar.size()); + + for (size_t j = 0; j < howmany; ++j) { + if (decomp[j] != sofar[j]) { + cout << codec.name() << "::append failed with i = " << i + << ", j = " << j << endl; + for (size_t j = 0; j < howmany; ++j) { + cout << j << " --> stored: " << decomp[j] << " correct: " << sofar[j] + << endl; } + cout << endl; + + throw std::logic_error("bug"); + } } + } - cout << codec.name() << "::select ok" << endl; + cout << codec.name() << "::append ok" << endl; } - -template -void testSelectSimpleOdd() { - T codec; - 
const int max = 259; - uint32_t ints[max]; - for (int i = 0; i < max; i++) - ints[i] = i; - - // encode in a buffer - vector compressedbuffer(max * sizeof(uint32_t) + 1024); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); - compressedbuffer.resize(nvalue); - compressedbuffer.shrink_to_fit(); - for (int i = 0; i < max; i++) { - uint32_t k = codec.select(compressedbuffer.data(), i); - if (k != (uint32_t)i) { - cout << codec.name() << "::odd select failed with " << i << " got back "<< k << endl; - throw std::logic_error("bug"); - } +template void testSelectSimple() { + T codec; + const int max = 256; + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = i; + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); + + compressedbuffer.resize(nvalue); + compressedbuffer.shrink_to_fit(); + for (int i = 0; i < max; i++) { + + uint32_t k = codec.select(compressedbuffer.data(), i); + if (k != (uint32_t)i) { + cout << codec.name() << "::select failed with " << i << endl; + throw std::logic_error("bug"); } + } - cout << codec.name() << "::select ok" << endl; + cout << codec.name() << "::select ok" << endl; } -template -void testSelectAdvanced() { - T codec; - const int max = 2944; - uint32_t ints[max]; - // print random seed to make the test reproducable - time_t t = ::time(0); - cout << "Seed is " << t << endl; - ::srand(static_cast(t)); - - // fill array with "random" values - for (int i = 0; i < max; i++) - ints[i] = 1 + i * 2; - std::random_shuffle(&ints[0], &ints[max]); - - // encode in a buffer - vector compressedbuffer(max * sizeof(uint32_t) + 1024); - size_t nvalue = compressedbuffer.size(); - codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); - - uint32_t k1, k2; - for (int i = 0; i < max; i++) { - k1 = codec.select(compressedbuffer.data(), i); - k2 = 
(uint32_t)ints[i]; - - if (k1 != k2) { - cout << "max = " << max << endl; - cout << "got back " << k1 << endl; - - for (int j = 0; j <= i; j++) {//shit - cout<"< void testSelectSimpleOdd() { + T codec; + const int max = 259; + uint32_t ints[max]; + for (int i = 0; i < max; i++) + ints[i] = i; + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); + compressedbuffer.resize(nvalue); + compressedbuffer.shrink_to_fit(); + for (int i = 0; i < max; i++) { + uint32_t k = codec.select(compressedbuffer.data(), i); + if (k != (uint32_t)i) { + cout << codec.name() << "::odd select failed with " << i << " got back " + << k << endl; + throw std::logic_error("bug"); } + } - cout << codec.name() << "::selectDelta ok" << endl; + cout << codec.name() << "::select ok" << endl; } -int main() { - testInsert(); - testInsert>(); - testInsert>(); - testInsert>(); - - testAppendByteArray(); - testAppendByteArray>(); - testAppendByteArray>(); - testAppendByteArray>(); - testAppendByteArray>(); - testAppendByteArray(); - testAppendByteArray(); - testAppendByteArray(); - - testAppend(); - testAppend(); - testAppend(); - - testSelectSimple(); - testSelectSimple(); - testSelectSimple(); - testSelectSimple>(); - testSelectSimple>(); - testSelectSimple>(); - testSelectSimple>(); - testSelectSimple>>(); - testSelectSimple(); - - testSelectSimpleOdd(); - testSelectSimpleOdd(); - testSelectSimpleOdd(); - testSelectSimpleOdd>(); - testSelectSimpleOdd>(); - testSelectSimpleOdd>(); - testSelectSimpleOdd>(); - testSelectSimpleOdd(); - - testSelectAdvanced(); - testSelectAdvanced(); - testSelectAdvanced(); - testSelectAdvanced>(); - testSelectAdvanced>(); - testSelectAdvanced>(); - testSelectAdvanced>(); - testSelectAdvanced>>(); - testSelectAdvanced(); - - testFindSimple(); - testFindSimple(); - testFindSimple(); - testFindSimple>(); - testFindSimple>(); - 
testFindSimple>(); - testFindSimple>(); - testFindSimple>>(); - testFindSimple(); - - testFindAdvanced(); - testFindAdvanced(); - testFindAdvanced(); - testFindAdvanced>(); - testFindAdvanced>(); - testFindAdvanced>(); - testFindAdvanced>(); - testFindAdvanced>>(); - testFindAdvanced(); - - for (string n : IntersectionFactory::allNames()) { - int error = 0; - intersectionfunction interfnc = IntersectionFactory::getFromName( - n); - cout<<"testing "< void testSelectAdvanced() { + T codec; + const int max = 2944; + uint32_t ints[max]; + // print random seed to make the test reproducable + time_t t = ::time(0); + cout << "Seed is " << t << endl; + ::srand(static_cast(t)); + + // fill array with "random" values + for (int i = 0; i < max; i++) + ints[i] = 1 + i * 2; + std::random_shuffle(&ints[0], &ints[max]); + + // encode in a buffer + vector compressedbuffer(max * sizeof(uint32_t) + 1024); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(ints, max, compressedbuffer.data(), nvalue); + + uint32_t k1, k2; + for (int i = 0; i < max; i++) { + k1 = codec.select(compressedbuffer.data(), i); + k2 = (uint32_t)ints[i]; + + if (k1 != k2) { + cout << "max = " << max << endl; + cout << "got back " << k1 << endl; + + for (int j = 0; j <= i; j++) { // shit + cout << j << "-->" << ints[j] << " " << endl; + } + + cout << codec.name() << "::selectDelta failed at line " << __LINE__ + << " (i = " << i << ")" << endl; + throw std::logic_error("bug"); } + } - vector> allcodecs = CODECFactory::allSchemes(); + cout << codec.name() << "::selectDelta ok" << endl; +} - testSmall(allcodecs); +int main() { + testInsert(); + testInsert>(); + testInsert>(); + testInsert>(); + + testAppendByteArray(); + testAppendByteArray>(); + testAppendByteArray>(); + testAppendByteArray>(); + testAppendByteArray>(); + testAppendByteArray(); + testAppendByteArray(); + testAppendByteArray(); + + testAppend(); + testAppend(); + testAppend(); + + testSelectSimple(); + testSelectSimple(); + 
testSelectSimple(); + testSelectSimple>(); + testSelectSimple>(); + testSelectSimple>(); + testSelectSimple>(); + testSelectSimple< + SIMDBinaryPacking>>(); + testSelectSimple(); + + testSelectSimpleOdd(); + testSelectSimpleOdd(); + testSelectSimpleOdd(); + testSelectSimpleOdd>(); + testSelectSimpleOdd>(); + testSelectSimpleOdd>(); + testSelectSimpleOdd>(); + testSelectSimpleOdd(); + + testSelectAdvanced(); + testSelectAdvanced(); + testSelectAdvanced(); + testSelectAdvanced>(); + testSelectAdvanced>(); + testSelectAdvanced>(); + testSelectAdvanced>(); + testSelectAdvanced< + SIMDBinaryPacking>>(); + testSelectAdvanced(); + + testFindSimple(); + testFindSimple(); + testFindSimple(); + testFindSimple>(); + testFindSimple>(); + testFindSimple>(); + testFindSimple>(); + testFindSimple< + SIMDBinaryPacking>>(); + testFindSimple(); + + testFindAdvanced(); + testFindAdvanced(); + testFindAdvanced(); + testFindAdvanced>(); + testFindAdvanced>(); + testFindAdvanced>(); + testFindAdvanced>(); + testFindAdvanced< + SIMDBinaryPacking>>(); + testFindAdvanced(); + + for (string n : IntersectionFactory::allNames()) { + int error = 0; + intersectionfunction interfnc = IntersectionFactory::getFromName(n); + cout << "testing " << n << " ... 
"; + cout.flush(); + int code; + if ((code = test1(interfnc, false)) == 0) + cout << "ok "; + else { + cout << " Error" << code << " "; + ++error; + } + if ((code = test2(interfnc)) == 0) + cout << "ok "; + else { + cout << " Error" << code << endl; + ++error; + } + if ((code = test3(interfnc)) == 0) + cout << "ok" << endl; + else { + cout << " Error" << code << endl; + ++error; + } + assert(error == 0); + } + vector> allcodecs = CODECFactory::allSchemes(); - for (int k = 0; k < 10; ++k) { - cout << k << " "; - cout.flush(); - unittrivial(true, allcodecs, 10); - unittrivial(false, allcodecs, 10); - unit(true, allcodecs, 10, 1374809627); - unit(false, allcodecs, 10, 1374809627); - } - cout << endl; + testSmall(allcodecs); - for (int k = 0; k < 10; ++k) { - cout << k << " "; - cout.flush(); - unit(true, allcodecs, 14, 1374809627); - unit(false, allcodecs, 14, 1374809627); - } - cout << endl; + for (int k = 0; k < 10; ++k) { + cout << k << " "; + cout.flush(); + unittrivial(true, allcodecs, 10); + unittrivial(false, allcodecs, 10); + unit(true, allcodecs, 10, 1374809627); + unit(false, allcodecs, 10, 1374809627); + } + cout << endl; + + for (int k = 0; k < 10; ++k) { + cout << k << " "; + cout.flush(); + unit(true, allcodecs, 14, 1374809627); + unit(false, allcodecs, 14, 1374809627); + } + cout << endl; - for (int k = 0; k < 100; ++k) { - cout << k << " "; - cout.flush(); - unit(false, allcodecs, 7, 1374809652 + k); - unit(true, allcodecs, 7, 1374809652 + k); - } - cout << endl; + for (int k = 0; k < 100; ++k) { + cout << k << " "; + cout.flush(); + unit(false, allcodecs, 7, 1374809652 + k); + unit(true, allcodecs, 7, 1374809652 + k); + } + cout << endl; - tellmeaboutmachine(); + tellmeaboutmachine(); - cout << "Code is probably ok." << endl; + cout << "Code is probably ok." 
<< endl; } diff --git a/src/usimdbitpacking.cpp b/src/usimdbitpacking.cpp index a8b09ae..78a95d9 100644 --- a/src/usimdbitpacking.cpp +++ b/src/usimdbitpacking.cpp @@ -10,13790 +10,13871 @@ namespace SIMDCompressionLib { using namespace std; /** - * This is generated from simdbitpacking.cpp by replacing _mm_load_si128 with _mm_loadu_si128 + * This is generated from simdbitpacking.cpp by replacing _mm_load_si128 with + * _mm_loadu_si128 * and _mm_storeu_si128 by _mm_storeu_si128. */ -void __uSIMD_fastpackwithoutmask0(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - (void)_in; - (void)out; +void __uSIMD_fastpackwithoutmask0(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + (void)_in; + (void)out; } -void __uSIMD_fastpackwithoutmask1(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); +void __uSIMD_fastpackwithoutmask1(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg 
= _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpackwithoutmask2(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); -} + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask2(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask3(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask3(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg 
= _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask5(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask5(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask6(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask6(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask7(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask7(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask9(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask9(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { 
- const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); +} - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask10(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); -void 
__uSIMD_fastpackwithoutmask10(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask11(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = 
_mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); } +void __uSIMD_fastpackwithoutmask12(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastpackwithoutmask11(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); +} - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask13(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpackwithoutmask14(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); } +void __uSIMD_fastpackwithoutmask15(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpackwithoutmask17(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); +} -void __uSIMD_fastpackwithoutmask12(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); +void __uSIMD_fastpackwithoutmask18(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg 
= _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask19(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask20(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask21(const uint32_t *__restrict__ _in, + __m128i 
*__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask22(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask23(const uint32_t *__restrict__ _in, + __m128i *__restrict__ 
out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask24(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask25(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg 
= _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask26(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask27(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = 
_mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask28(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg 
= _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask29(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 
18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpackwithoutmask30(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, 
OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); 
/* Packs 128 31-bit integers (SIMD-interleaved: value i occupies lane i%4 of
 * input vector i/4) into 31 consecutive 128-bit words.  "withoutmask" means
 * the caller guarantees every input already fits in 31 bits.
 * Rewritten as a carry loop instead of the fully unrolled chain: each output
 * word ORs in the next input at `shift`, and the bits that overflow the
 * 32-bit lane seed the low bits of the following word. */
void __uSIMD_fastpackwithoutmask31(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  __m128i InReg = _mm_loadu_si128(in);
  __m128i OutReg = InReg; /* value 0 starts at bit 0 of word 0 */
  for (int shift = 31; shift >= 1; --shift) {
    InReg = _mm_loadu_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, shift));
    _mm_storeu_si128(out, OutReg);
    ++out;
    /* high (32 - shift) bits of InReg did not fit; carry them over */
    OutReg = _mm_srli_epi32(InReg, 32 - shift);
  }
}

/* Bit width 32: packing is the identity transform — copy all 32 input
 * vectors straight to the output. */
void __uSIMD_fastpackwithoutmask32(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  for (int i = 0; i < 32; ++i) {
    _mm_storeu_si128(out + i, _mm_loadu_si128(in + i));
  }
}
/* Bit width 4: eight 4-bit values per 32-bit lane.  Each output word is
 * independent (no carry across words), so four iterations each gather
 * eight input vectors at shifts 0,4,...,28.  Inputs must already fit in
 * 4 bits ("withoutmask"). */
void __uSIMD_fastpackwithoutmask4(const uint32_t *__restrict__ _in,
                                  __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  for (uint32_t word = 0; word < 4; ++word) {
    __m128i acc = _mm_loadu_si128(in);
    for (int k = 1; k < 8; ++k) {
      acc = _mm_or_si128(acc, _mm_slli_epi32(_mm_loadu_si128(in + k), 4 * k));
    }
    _mm_storeu_si128(out, acc);
    ++out;
    in += 8;
  }
}

/* Bit width 8: four 8-bit values per lane; eight independent output
 * words at shifts 0,8,16,24. */
void __uSIMD_fastpackwithoutmask8(const uint32_t *__restrict__ _in,
                                  __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  for (uint32_t word = 0; word < 8; ++word) {
    __m128i acc = _mm_loadu_si128(in);
    for (int k = 1; k < 4; ++k) {
      acc = _mm_or_si128(acc, _mm_slli_epi32(_mm_loadu_si128(in + k), 8 * k));
    }
    _mm_storeu_si128(out, acc);
    ++out;
    in += 4;
  }
}

/* Bit width 16: two 16-bit values per lane; sixteen independent output
 * words at shifts 0 and 16. */
void __uSIMD_fastpackwithoutmask16(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  for (uint32_t word = 0; word < 16; ++word) {
    __m128i acc = _mm_loadu_si128(in);
    acc = _mm_or_si128(acc, _mm_slli_epi32(_mm_loadu_si128(in + 1), 16));
    _mm_storeu_si128(out, acc);
    ++out;
    in += 2;
  }
}

/* Bit width 0: nothing to emit; parameters intentionally unused. */
void __uSIMD_fastpack0(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  (void)_in;
  (void)out;
}
/* Bit width 1, masked variant: keep only the low bit of each of the 128
 * inputs (32 per lane) and accumulate them into a single 128-bit word,
 * value i at bit i of its lane.  Rewritten as a bit-position loop instead
 * of the fully unrolled 32-step chain. */
void __uSIMD_fastpack1(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << 1) - 1);

  __m128i OutReg = _mm_and_si128(_mm_loadu_si128(in), mask);
  for (int bit = 1; bit < 32; ++bit) {
    const __m128i InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, bit));
  }
  _mm_storeu_si128(out, OutReg);
}
/* Bit width 2, masked variant: 128 inputs (32 per lane), 16 two-bit
 * fields per 32-bit lane, producing two 128-bit output words.  The two
 * words are independent, so both halves run the same shift loop. */
void __uSIMD_fastpack2(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << 2) - 1);

  for (int half = 0; half < 2; ++half) {
    __m128i OutReg = _mm_and_si128(_mm_loadu_si128(in), mask);
    for (int shift = 2; shift < 32; shift += 2) {
      const __m128i InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
      OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, shift));
    }
    _mm_storeu_si128(out, OutReg);
    ++out;
    ++in; /* step past the last vector consumed by this half */
  }
}
*__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); -void __uSIMD_fastpackwithoutmask17(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_loadu_si128(++in); + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); } +void __uSIMD_fastpack5(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); -void __uSIMD_fastpackwithoutmask18(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask19(const uint32_t *__restrict__ 
_in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask20(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - 
InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); 
- InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask21(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask22(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask23(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask24(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = 
_mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask25(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void 
__uSIMD_fastpackwithoutmask26(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask27(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 
- 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpackwithoutmask28(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - 
// NOTE(review): this span replaced patch-mangled residue (diff removal lines
// smeared together; `reinterpret_cast` had lost its `<const __m128i *>`
// template argument throughout).  The partial tail of
// __uSIMD_fastpackwithoutmask28 and the partial head of __uSIMD_fastpack6
// that were smeared into this span could not be reconstructed from the
// visible text alone — regenerate the file to restore them.

// Packs 128 32-bit values (32 unaligned SSE registers, i.e. 4 interleaved
// 32-value lanes) into `bit` output registers per lane, least-significant
// bits first.  The caller guarantees every input value already fits in
// `bit` bits; no masking is performed.  Requires 1 <= bit <= 32.
static void __uSIMD_pack_nomask(const uint32_t *__restrict__ _in,
                                __m128i *__restrict__ out,
                                const uint32_t bit) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  __m128i OutReg = _mm_setzero_si128();
  uint32_t filled = 0; // bits already occupied in the pending output register
  for (uint32_t k = 0; k < 32; ++k) {
    const __m128i InReg = _mm_loadu_si128(in + k);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, filled));
    filled += bit;
    if (filled >= 32) { // register full: flush it, keep any spill-over bits
      _mm_storeu_si128(out++, OutReg);
      filled -= 32;
      OutReg = (filled > 0) ? _mm_srli_epi32(InReg, bit - filled)
                            : _mm_setzero_si128();
    }
  }
  // 32 * bit is a multiple of 32, so the last flush happened inside the loop.
}

// Same layout as __uSIMD_pack_nomask, but each input value is masked down to
// `bit` bits first, so arbitrary 32-bit inputs are safe.
// Requires 1 <= bit <= 32.
static void __uSIMD_pack_mask(const uint32_t *__restrict__ _in,
                              __m128i *__restrict__ out, const uint32_t bit) {
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  // 1ULL avoids undefined behavior of `1U << 32` when bit == 32.
  const __m128i mask = _mm_set1_epi32((int)((1ULL << bit) - 1));
  __m128i OutReg = _mm_setzero_si128();
  uint32_t filled = 0; // bits already occupied in the pending output register
  for (uint32_t k = 0; k < 32; ++k) {
    const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, filled));
    filled += bit;
    if (filled >= 32) { // register full: flush it, keep any spill-over bits
      _mm_storeu_si128(out++, OutReg);
      filled -= 32;
      OutReg = (filled > 0) ? _mm_srli_epi32(InReg, bit - filled)
                            : _mm_setzero_si128();
    }
  }
}

// Packs 128 values, assumed to fit in 29 bits, into 29 SSE registers.
void __uSIMD_fastpackwithoutmask29(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 29);
}

// Packs 128 values, assumed to fit in 30 bits, into 30 SSE registers.
void __uSIMD_fastpackwithoutmask30(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 30);
}

// Packs 128 values, assumed to fit in 31 bits, into 31 SSE registers.
void __uSIMD_fastpackwithoutmask31(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 31);
}

// Bit width 32 degenerates to a straight copy of all 32 SSE registers.
void __uSIMD_fastpackwithoutmask32(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 32);
}

// Packs 128 values, assumed to fit in 4 bits, into 4 SSE registers.
void __uSIMD_fastpackwithoutmask4(const uint32_t *__restrict__ _in,
                                  __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 4);
}

// Packs 128 values, assumed to fit in 8 bits, into 8 SSE registers.
void __uSIMD_fastpackwithoutmask8(const uint32_t *__restrict__ _in,
                                  __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 8);
}

// Packs 128 values, assumed to fit in 16 bits, into 16 SSE registers.
void __uSIMD_fastpackwithoutmask16(const uint32_t *__restrict__ _in,
                                   __m128i *__restrict__ out) {
  __uSIMD_pack_nomask(_in, out, 16);
}

// Bit width 0: every value is implicitly zero, so nothing is emitted.
void __uSIMD_fastpack0(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  (void)_in;
  (void)out;
}

// Masks each of 128 values to 1 bit and packs them into 1 SSE register.
void __uSIMD_fastpack1(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  __uSIMD_pack_mask(_in, out, 1);
}

// Masks each of 128 values to 2 bits and packs them into 2 SSE registers.
void __uSIMD_fastpack2(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  __uSIMD_pack_mask(_in, out, 2);
}

// Masks each of 128 values to 3 bits and packs them into 3 SSE registers.
void __uSIMD_fastpack3(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  __uSIMD_pack_mask(_in, out, 3);
}

// Masks each of 128 values to 5 bits and packs them into 5 SSE registers.
void __uSIMD_fastpack5(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
  __uSIMD_pack_mask(_in, out, 5);
}
mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack7(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack9(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack10(const uint32_t 
*__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack11(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); 
- ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack12(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack13(const uint32_t *__restrict__ 
_in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = 
// Pack the low 14 bits of 128 32-bit integers into 14 __m128i words.
// The four 32-bit SIMD lanes hold four independent vertical bitstreams that
// are packed in parallel; values are masked, so bits above bit 13 are lost.
// Rolled form of the generated fully-unrolled packer: identical stores in
// identical order.
void __uSIMD_fastpack14(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 14; // bits kept per value
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // bits already occupied in accum, per lane
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // lane word is full: flush, keep the spilled high bits
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 15 bits of 128 32-bit integers into 15 __m128i words.
// Each of the four 32-bit lanes carries its own bitstream; inputs are masked
// to 15 bits first. Behaviorally identical rolled version of the generated
// unrolled code.
void __uSIMD_fastpack15(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 15; // payload width per value
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // occupied bits in the current output word (per lane)
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // flush a completed word, carry the overflow bits
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 17 bits of 128 32-bit integers into 17 __m128i words
// (four parallel per-lane bitstreams, little-endian within each 32-bit
// word). Rolled rewrite of the generated unrolled packer; same stores,
// same order.
void __uSIMD_fastpack17(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 17;
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // bits consumed in the word being assembled
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // word complete: emit it, retain the spilled top bits
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 18 bits of 128 32-bit integers into 18 __m128i words.
// Four independent lane bitstreams; the stream restarts cleanly halfway
// through (16 * 18 bits = 9 words), matching the generated code's
// mid-sequence "OutReg = InReg" reset.
void __uSIMD_fastpack18(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 18;
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // bits filled so far in the pending output word
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // flush; carry remainder unless the fit was exact
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 19 bits of 128 32-bit integers into 19 __m128i words
// (four per-lane bitstreams packed in parallel). Rolled, behaviorally
// identical version of the generated unrolled code.
void __uSIMD_fastpack19(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 19;
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // bit cursor within the word under construction
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // store the filled word and keep the overflow
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 20 bits of 128 32-bit integers into 20 __m128i words.
// 8 values fill exactly 5 output words per lane, so the generated code is
// the same 8-value pattern repeated four times; this loop reproduces that
// pattern, including the clean restart every 8 inputs.
void __uSIMD_fastpack20(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 20;
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // occupied bits in the pending word (per lane)
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // emit; exact fits (every 8 values) carry nothing
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
// Pack the low 21 bits of 128 32-bit integers into 21 __m128i words
// (four parallel per-lane bitstreams). Rolled rewrite of the generated
// unrolled packer; every boundary except the last spills bits into the
// next word, exactly as in the original sequence.
void __uSIMD_fastpack21(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
  const int bit = 21;
  const __m128i *in = reinterpret_cast<const __m128i *>(_in);
  const __m128i mask = _mm_set1_epi32((1U << bit) - 1);
  __m128i accum = _mm_setzero_si128();
  int used = 0; // bits already written into the current word
  for (int k = 0; k < 32; ++k) {
    const __m128i cur = _mm_and_si128(_mm_loadu_si128(in + k), mask);
    accum = (used == 0) ? cur
                        : _mm_or_si128(accum, _mm_slli_epi32(cur, used));
    used += bit;
    if (used >= 32) { // flush the completed word, carry the overflow bits
      _mm_storeu_si128(out++, accum);
      used -= 32;
      if (used > 0)
        accum = _mm_srli_epi32(cur, bit - used);
    }
  }
}
__m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack23(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg 
= _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack24(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack25(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack26(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack27(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack28(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack29(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 
29 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack30(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - 
OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -void __uSIMD_fastpack31(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); +void __uSIMD_fastpack6(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); 
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastpack32(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg; + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); +} - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpack7(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - 
_mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastpack4(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for (uint32_t outer = 0; outer < 4 ; ++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 4), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 5), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 6), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 7), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in += 8; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
25)); + _mm_storeu_si128(out, OutReg); } +void __uSIMD_fastpack9(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); -void __uSIMD_fastpack8(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for (uint32_t outer = 0; outer < 8 ; ++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in += 4; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastpack16(const uint32_t *__restrict__ _in, __m128i *__restrict__ out) { - const __m128i *in = reinterpret_cast(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for (uint32_t outer = 0; outer < 16 ; ++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - in += 2; - } + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastunpack1(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg1 = _mm_loadu_si128(in); - __m128i InReg2 = InReg1; - __m128i OutReg1, OutReg2, OutReg3, OutReg4; - const __m128i mask = _mm_set1_epi32(1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - unsigned shift = 0; + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - for (unsigned i = 0; i < 8; ++i) { - OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++) , mask); - OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++) , mask); - OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++) , mask); - OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++) , mask); - _mm_storeu_si128(out++, OutReg1); - _mm_storeu_si128(out++, OutReg2); - _mm_storeu_si128(out++, OutReg3); - _mm_storeu_si128(out++, OutReg4); - } -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastunpack2(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = 
_mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 2) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack10(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
_mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 
2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastunpack3(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); +} + +void __uSIMD_fastpack11(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 3) - 1); + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 27) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack12(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_srli_epi32(InReg, 29) ; - _mm_storeu_si128(out++, OutReg); + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -} + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); -void __uSIMD_fastunpack4(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 4) - 1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg 
= _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - _mm_storeu_si128(out++, OutReg); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); +} + +void __uSIMD_fastpack13(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); - _mm_storeu_si128(out++, OutReg); + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack14(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack15(const uint32_t *__restrict__ _in, + __m128i 
*__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack17(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack18(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 28) ; - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack19(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + 
+ const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 
4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpack20(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpack21(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); } +void __uSIMD_fastpack22(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpack23(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i 
*in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); 
+ _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); +} +void __uSIMD_fastpack24(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); +} -void __uSIMD_fastunpack5(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { +void __uSIMD_fastpack25(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); +} - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 5) - 1); +void __uSIMD_fastpack26(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack27(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack28(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 
- 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack29(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack30(const 
uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack31(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); 
+ _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack32(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastpack4(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + for (uint32_t outer = 0; outer < 4; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + InReg = _mm_and_si128(_mm_loadu_si128(in + 6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + in += 8; + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack8(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + for (uint32_t outer = 0; outer < 8; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + in += 4; + } +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastpack16(const uint32_t *__restrict__ _in, + __m128i *__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + for (uint32_t outer = 0; outer < 16; ++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + InReg = _mm_and_si128(_mm_loadu_si128(in + 1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + in += 2; + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack1(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + + for (unsigned i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, shift++), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, shift++), mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack2(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 2) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack6(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); 
+ _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack3(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 3) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, 
mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 27), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, 
OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack7(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 28), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 7) - 1); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , 
mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack4(const __m128i *__restrict__ in, + uint32_t *__restrict__ 
_out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 4) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack8(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 8) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastunpack5(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 5) - 1); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 25), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 26), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + _mm_storeu_si128(out++, OutReg); } +void __uSIMD_fastunpack6(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 6) - 1); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack9(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 9) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); 
- OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack7(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 7) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), 
mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 3), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack10(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 24), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 10) - 1); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); 
- _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 23), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack8(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 8) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, 
mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack11(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 11) - 1); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 
11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack9(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 9) - 1); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 22), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 21), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack12(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 12) - 1); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastunpack10(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 10) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 
20) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 6), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack13(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 13) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack11(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 11) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
+ _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18) , 
mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 19), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 9), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 20), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack14(const __m128i *__restrict__ in, uint32_t 
*__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 21); + _mm_storeu_si128(out++, OutReg); +} - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 14) - 1); +void __uSIMD_fastunpack12(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 12) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 18) ; - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack13(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 13) - 1); -} + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 7), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack15(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 15) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg 
= _mm_and_si128(_mm_srli_epi32(InReg, 17), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 18), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack14(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 14) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , 
mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack16(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 16) - 1); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), 
mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + _mm_storeu_si128(out++, OutReg); +} - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack15(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 15) - 1); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 15), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 7), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 5), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack17(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 17) - 1); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 16), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 15 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack16(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 16) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = 
_mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack18(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 18) - 1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - 
_mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack17(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - 
OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 17) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), 
mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - 
_mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 14), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 3), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 5), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack19(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i 
*>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 19) - 1); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 13), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack18(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 18) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 16), mask)); + _mm_storeu_si128(out++, 
OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 2), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack20(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 20) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack19(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 19) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 12), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 11), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 4), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 9), mask)); 
+ _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - _mm_storeu_si128(out++, OutReg); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 14), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 7), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack21(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + _mm_storeu_si128(out++, OutReg); +} + +void __uSIMD_fastunpack20(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 20) - 1); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + _mm_storeu_si128(out++, OutReg); } +void __uSIMD_fastunpack21(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 21) - 1); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack22(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 22) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 10), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg 
= _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 9), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 14), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 11); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack22(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 22) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = 
_mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack23(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 23) - 1); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 8), mask)); + _mm_storeu_si128(out++, 
OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastunpack23(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 23) - 1); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 
4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack24(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 24) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), 
mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 7), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 8), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) 
; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack24(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 24) - 1); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + 
InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack25(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 25) - 1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + 
OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack25(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 25) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 7) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 5), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack26(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 26) - 1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; 
- InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 6), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack26(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 26) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack27(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 27) - 1); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 
8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), 
mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = 
_mm_srli_epi32(InReg, 6); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack27(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 27) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - 
OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 4), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + 
OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 6), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack28(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 28) - 1); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), 
mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 3), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + 
OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 5); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack28(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 28) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 
- 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + 
InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack29(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 29) - 1); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); - 
_mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack29(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 29) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 2), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 2), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1) , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - 
InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29 - 1), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(_mm_srli_epi32(InReg, 1), mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 27), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 3) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 24), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 21), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 18), mask)); + _mm_storeu_si128(out++, OutReg); -void __uSIMD_fastunpack30(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 30) - 1); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29 - 3), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 3); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); +void __uSIMD_fastunpack30(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 30) - 1); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = 
_mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 2) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = 
_mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - 
OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_srli_epi32(InReg, 2) ; - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); -} + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); -void __uSIMD_fastunpack31(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask)); + 
_mm_storeu_si128(out++, OutReg); - __m128i *out = reinterpret_cast<__m128i *>(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U << 31) - 1); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_and_si128(InReg , mask); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 31) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 30) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 2); + _mm_storeu_si128(out++, OutReg); +} - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); - _mm_storeu_si128(out++, OutReg); +void __uSIMD_fastunpack31(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { - OutReg = _mm_srli_epi32(InReg, 29) ; - InReg = _mm_loadu_si128(++in); + __m128i *out = reinterpret_cast<__m128i *>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U << 31) - 1); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_and_si128(InReg, mask); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 28) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 31); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask)); + 
_mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 27) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 30); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 26) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 29); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 25) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 28); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 24) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 27); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 23) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 26); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = 
_mm_srli_epi32(InReg, 22) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 25); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 21) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 24); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 20) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 23); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 19) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 22); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 18) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 21); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 17) ; - InReg = _mm_loadu_si128(++in); + 
OutReg = _mm_srli_epi32(InReg, 20); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 16) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 19); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 15) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 18); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 14) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 17); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 13) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 16); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 12) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 15); + InReg = 
_mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 11) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 14); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 10) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 13); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 9) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 12); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 8) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 11); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 7) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 10); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 6) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 9); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 5) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 8); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 4) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 7); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 3) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 6); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); - _mm_storeu_si128(out++, OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 2) ; - InReg = _mm_loadu_si128(++in); + OutReg = _mm_srli_epi32(InReg, 5); + InReg = _mm_loadu_si128(++in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); - _mm_storeu_si128(out++, 
OutReg); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask)); + _mm_storeu_si128(out++, OutReg); - OutReg = _mm_srli_epi32(InReg, 1) ; - _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 4); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask)); + _mm_storeu_si128(out++, OutReg); -} + OutReg = _mm_srli_epi32(InReg, 3); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask)); + _mm_storeu_si128(out++, OutReg); + OutReg = _mm_srli_epi32(InReg, 2); + InReg = _mm_loadu_si128(++in); + OutReg = + _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg, 1); + _mm_storeu_si128(out++, OutReg); +} -void __uSIMD_fastunpack32(const __m128i *__restrict__ in, uint32_t *__restrict__ _out) { - __m128i *out = reinterpret_cast<__m128i *>(_out); - for (uint32_t outer = 0; outer < 32 ; ++outer) { - _mm_storeu_si128(out++, _mm_loadu_si128(in++)); - } +void __uSIMD_fastunpack32(const __m128i *__restrict__ in, + uint32_t *__restrict__ _out) { + __m128i *out = reinterpret_cast<__m128i *>(_out); + for (uint32_t outer = 0; outer < 32; ++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } } } // namespace SIMDCompressionLib diff --git a/src/varintdecode.c b/src/varintdecode.c index 89d9192..ee52f68 100644 --- a/src/varintdecode.c +++ b/src/varintdecode.c @@ -6,2258 +6,2230 @@ #endif #include "platform.h" -static int read_int(const uint8_t* in, uint32_t* out) { - *out = in[0] & 0x7F; - if (in[0] < 128) { - return 1; - } - *out = ((in[1] & 0x7FU) << 7) | *out; - if (in[1] < 128) { - return 2; - } - *out = ((in[2] & 0x7FU) << 14) | *out; - if (in[2] < 128) { - return 3; - } - *out = ((in[3] & 0x7FU) << 21) | *out; - if (in[3] < 128) { - return 4; - } - *out = ((in[4] & 0x7FU) << 28) | *out; - return 5; +static int 
read_int(const uint8_t *in, uint32_t *out) { + *out = in[0] & 0x7F; + if (in[0] < 128) { + return 1; + } + *out = ((in[1] & 0x7FU) << 7) | *out; + if (in[1] < 128) { + return 2; + } + *out = ((in[2] & 0x7FU) << 14) | *out; + if (in[2] < 128) { + return 3; + } + *out = ((in[3] & 0x7FU) << 21) | *out; + if (in[3] < 128) { + return 4; + } + *out = ((in[4] & 0x7FU) << 28) | *out; + return 5; } -static inline int read_int_delta(const uint8_t* in, uint32_t* out, uint32_t* prev) { - *out = in[0] & 0x7F; - if (in[0] < 128) { - *prev += *out; - *out = *prev; - return 1; - } - *out = ((in[1] & 0x7FU) << 7) | *out; - if (in[1] < 128) { - *prev += *out; - *out = *prev; - return 2; - } - *out = ((in[2] & 0x7FU) << 14) | *out; - if (in[2] < 128) { - *prev += *out; - *out = *prev; - return 3; - } - *out = ((in[3] & 0x7FU) << 21) | *out; - if (in[3] < 128) { - *prev += *out; - *out = *prev; - return 4; - } - *out = ((in[4] & 0x7FU) << 28) | *out; +static inline int read_int_delta(const uint8_t *in, uint32_t *out, + uint32_t *prev) { + *out = in[0] & 0x7F; + if (in[0] < 128) { *prev += *out; *out = *prev; - return 5; + return 1; + } + *out = ((in[1] & 0x7FU) << 7) | *out; + if (in[1] < 128) { + *prev += *out; + *out = *prev; + return 2; + } + *out = ((in[2] & 0x7FU) << 14) | *out; + if (in[2] < 128) { + *prev += *out; + *out = *prev; + return 3; + } + *out = ((in[3] & 0x7FU) << 21) | *out; + if (in[3] < 128) { + *prev += *out; + *out = *prev; + return 4; + } + *out = ((in[4] & 0x7FU) << 28) | *out; + *prev += *out; + *out = *prev; + return 5; } - - - typedef struct index_bytes_consumed { - uint8_t index; - uint8_t bytes_consumed; + uint8_t index; + uint8_t bytes_consumed; } index_bytes_consumed; -static SIMDCOMP_ALIGNED(0x1000) index_bytes_consumed combined_lookup[] = { - {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, - {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, - {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, - {56, 9}, {85, 
7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, - {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, - {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, - {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, - {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, - {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, - {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, - {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, - {14, 9}, {60, 10}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, - {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, - {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, - {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, - {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, - {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, - {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, - {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, - {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, - {121, 7}, {13, 9}, {58, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, - {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, - {35, 9}, {19, 9}, {119, 7}, {11, 9}, {54, 10}, {83, 7}, {160, 5}, - {7, 9}, {46, 10}, {30, 10}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, - {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, - {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, - {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, - {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, - {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, - {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, - {99, 9}, {81, 9}, {142, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, 
{147, 4}, - {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, - {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, - {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, - {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, - {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, - {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, - {36, 8}, {20, 8}, {121, 7}, {12, 8}, {57, 10}, {85, 7}, {161, 6}, - {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {53, 10}, - {83, 7}, {160, 5}, {6, 8}, {45, 10}, {29, 10}, {130, 8}, {71, 7}, - {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, - {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, - {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, - {17, 8}, {118, 6}, {9, 8}, {51, 10}, {82, 6}, {160, 5}, {5, 8}, - {43, 10}, {27, 10}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, - {3, 8}, {39, 10}, {23, 10}, {122, 8}, {15, 10}, {62, 11}, {86, 8}, - {161, 6}, {66, 6}, {98, 8}, {80, 8}, {140, 10}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, - {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {134, 10}, - {72, 8}, {116, 10}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, - {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, - {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, - {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, - {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, - {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, - {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {138, 10}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, - {67, 5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, - {132, 10}, {71, 7}, {114, 10}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, - {75, 7}, {126, 10}, {69, 
7}, {108, 10}, {90, 10}, {162, 7}, {145, 2}, - {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, - {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, - {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 5}, - {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, - {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, - {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, - {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {169, 10}, {0, 4}, - {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, - {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, - {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, - {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, - {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, - {12, 8}, {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, - {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, - {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, - {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, - {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, - {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, - {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, - {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, - {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, - {122, 8}, {14, 9}, {61, 11}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, - {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, - {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, - {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, - {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, - {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, - {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, - {8, 7}, {49, 
9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, - {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, - {21, 9}, {121, 7}, {13, 9}, {59, 11}, {85, 7}, {161, 6}, {66, 6}, - {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, {55, 11}, {83, 7}, - {160, 5}, {7, 9}, {47, 11}, {31, 11}, {131, 9}, {71, 7}, {113, 9}, - {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, - {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, - {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, - {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, - {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, - {92, 6}, {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, - {66, 6}, {99, 9}, {81, 9}, {143, 11}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, - {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, - {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, - {145, 2}, {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, - {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, - {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, - {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, - {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {103, 7}, {85, 7}, - {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, - {101, 7}, {83, 7}, {160, 5}, {6, 8}, {95, 7}, {77, 7}, {130, 8}, - {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, - {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, - {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, - {33, 8}, {17, 8}, {118, 6}, {9, 8}, {100, 6}, {82, 6}, {160, 5}, - {5, 8}, {94, 6}, {76, 6}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, - {165, 6}, {3, 8}, {92, 6}, {74, 6}, {122, 8}, {68, 6}, 
{104, 8}, - {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {141, 11}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, - {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, - {135, 11}, {72, 8}, {117, 11}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, - {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, - {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, - {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, - {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, - {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, - {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {157, 6}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, - {119, 7}, {67, 5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, - {77, 7}, {156, 5}, {71, 7}, {152, 5}, {148, 5}, {166, 7}, {64, 4}, - {93, 7}, {75, 7}, {155, 4}, {69, 7}, {151, 4}, {147, 4}, {162, 7}, - {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, - {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, - {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, - {152, 5}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, - {68, 6}, {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, - {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {0, 4}, {0, 5}, - {0, 5}, {0, 4}, {0, 5}, {0, 4}, {0, 4}, {0, 5}, {0, 5}, - {0, 3}, {0, 3}, {0, 5}, {0, 2}, {0, 5}, {0, 5}, {0, 0}, - {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, - {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, - {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, - {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, - {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, - {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, - {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, - {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, - 
{6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, - {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, - {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, - {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, - {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, - {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, - {22, 9}, {122, 8}, {14, 9}, {60, 10}, {86, 8}, {161, 6}, {66, 6}, - {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, - {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, - {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, - {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, - {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, - {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, - {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, - {37, 9}, {21, 9}, {121, 7}, {13, 9}, {58, 10}, {85, 7}, {161, 6}, - {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, {54, 10}, - {83, 7}, {160, 5}, {7, 9}, {46, 10}, {30, 10}, {131, 9}, {71, 7}, - {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, - {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, - {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, - {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, - {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, - {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, - {161, 6}, {66, 6}, {99, 9}, {81, 9}, {142, 10}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, - {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, - {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, - {155, 4}, {145, 2}, {151, 4}, 
{147, 4}, {164, 9}, {0, 2}, {0, 3}, - {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, - {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, - {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, - {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {57, 10}, - {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, - {10, 8}, {53, 10}, {83, 7}, {160, 5}, {6, 8}, {45, 10}, {29, 10}, - {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, - {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, - {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, - {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {51, 10}, {82, 6}, - {160, 5}, {5, 8}, {43, 10}, {27, 10}, {128, 8}, {70, 6}, {110, 8}, - {148, 5}, {165, 6}, {3, 8}, {39, 10}, {23, 10}, {122, 8}, {15, 10}, - {63, 12}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {140, 10}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, - {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, - {78, 8}, {134, 10}, {72, 8}, {116, 10}, {148, 5}, {167, 8}, {64, 4}, - {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, - {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, - {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, - {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, - {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, - {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, - {138, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, - {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, - {95, 7}, {77, 7}, {132, 10}, {71, 7}, {114, 10}, {148, 5}, {166, 7}, - {64, 4}, {93, 7}, {75, 7}, {126, 10}, {69, 7}, {108, 10}, {90, 10}, - {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, - {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, - 
{100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, - {70, 6}, {152, 5}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, - {155, 4}, {68, 6}, {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, - {146, 3}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, - {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, - {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, - {169, 10}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, - {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, - {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, - {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, - {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, - {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 6}, {66, 6}, - {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, - {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, - {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, - {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, - {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, - {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, - {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, - {38, 9}, {22, 9}, {122, 8}, {14, 9}, {104, 8}, {86, 8}, {161, 6}, - {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, - {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, - {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, - {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, - {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, - {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, {4, 7}, - {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, - {2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, {103, 7}, {85, 
7}, - {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, - {101, 7}, {83, 7}, {160, 5}, {7, 9}, {95, 7}, {77, 7}, {131, 9}, - {71, 7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, - {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, - {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, - {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, - {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, - {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, {105, 9}, - {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, {144, 12}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, - {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, - {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, - {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {164, 9}, {0, 2}, - {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, - {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, - {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, - {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, - {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, - {119, 7}, {10, 8}, {101, 7}, {83, 7}, {160, 5}, {6, 8}, {95, 7}, - {77, 7}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, - {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, - {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, - {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {100, 6}, - {82, 6}, {160, 5}, {5, 8}, {94, 6}, {76, 6}, {128, 8}, {70, 6}, - {110, 8}, {148, 5}, {165, 6}, {3, 8}, {92, 6}, {74, 6}, {122, 8}, - {68, 6}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, - {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, - {73, 5}, {120, 8}, {67, 5}, {102, 8}, 
{84, 8}, {160, 5}, {65, 5}, - {96, 8}, {78, 8}, {156, 5}, {72, 8}, {152, 5}, {148, 5}, {167, 8}, - {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, - {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, - {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, - {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, - {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, - {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, - {79, 7}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, - {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, {160, 5}, - {65, 5}, {95, 7}, {77, 7}, {156, 5}, {71, 7}, {152, 5}, {148, 5}, - {166, 7}, {64, 4}, {93, 7}, {75, 7}, {155, 4}, {69, 7}, {151, 4}, - {147, 4}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, - {154, 7}, {0, 0}, {0, 0}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, - {0, 5}, {0, 6}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 6}, - {0, 5}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 4}, {0, 6}, - {0, 6}, {0, 4}, {0, 6}, {0, 4}, {0, 4}, {0, 6}, {0, 6}, - {0, 3}, {0, 3}, {0, 6}, {0, 2}, {0, 6}, {0, 6}, {0, 0}, - {0, 4}, {0, 5}, {0, 5}, {0, 4}, {0, 5}, {0, 4}, {0, 4}, - {0, 5}, {0, 5}, {0, 3}, {0, 3}, {0, 5}, {0, 2}, {0, 5}, - {0, 5}, {0, 0}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, - {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, - {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, - {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, - {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, - {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 6}, - {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, - {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, - {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, - {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, - {158, 7}, {145, 2}, {154, 7}, 
{0, 0}, {0, 0}, {0, 6}, {33, 8}, - {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, - {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, - {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, {60, 10}, {86, 8}, - {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, - {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, - {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, - {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, - {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, - {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, - {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, - {165, 6}, {2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, {58, 10}, - {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, - {11, 9}, {54, 10}, {83, 7}, {160, 5}, {7, 9}, {46, 10}, {30, 10}, - {131, 9}, {71, 7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, - {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, - {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, - {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, - {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, - {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, - {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, {142, 10}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, - {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, - {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, - {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {164, 9}, - {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, - {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, - {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, - {109, 7}, 
{148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, - {12, 8}, {57, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, - {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, - {18, 8}, {119, 7}, {10, 8}, {53, 10}, {83, 7}, {160, 5}, {6, 8}, - {45, 10}, {29, 10}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, - {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, - {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, - {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, - {51, 10}, {82, 6}, {160, 5}, {5, 8}, {43, 10}, {27, 10}, {128, 8}, - {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {39, 10}, {23, 10}, - {122, 8}, {15, 10}, {62, 11}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, - {80, 8}, {140, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, - {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, - {65, 5}, {96, 8}, {78, 8}, {134, 10}, {72, 8}, {116, 10}, {148, 5}, - {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, - {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, - {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, - {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, - {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, - {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, - {97, 7}, {79, 7}, {138, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, - {160, 5}, {65, 5}, {95, 7}, {77, 7}, {132, 10}, {71, 7}, {114, 10}, - {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {126, 10}, {69, 7}, - {108, 10}, {90, 10}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, - {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, - {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, - {76, 6}, {156, 5}, {70, 6}, {152, 5}, {148, 5}, {165, 6}, {64, 4}, - {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, {147, 4}, {161, 6}, - {66, 6}, {150, 3}, {146, 3}, 
{157, 6}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, - {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, - {152, 5}, {148, 5}, {169, 10}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, - {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, - {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, - {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, - {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, - {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, - {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, - {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, - {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, - {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, - {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, - {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, - {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, - {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, {61, 11}, - {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, - {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, - {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, - {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, - {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, - {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, - {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, - {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, - {59, 11}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, - {119, 7}, {11, 9}, {55, 11}, {83, 7}, {160, 5}, {7, 9}, {47, 11}, - {31, 11}, {131, 9}, {71, 
7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, - {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, - {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, - {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, - {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, - {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, - {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, - {143, 11}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, - {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, - {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, - {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, - {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, - {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, - {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, - {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, - {121, 7}, {12, 8}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, - {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, - {34, 8}, {18, 8}, {119, 7}, {10, 8}, {101, 7}, {83, 7}, {160, 5}, - {6, 8}, {95, 7}, {77, 7}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, - {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, - {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, - {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, - {9, 8}, {100, 6}, {82, 6}, {160, 5}, {5, 8}, {94, 6}, {76, 6}, - {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {92, 6}, - {74, 6}, {122, 8}, {68, 6}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, - {98, 8}, {80, 8}, {141, 11}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, - {160, 5}, {65, 5}, {96, 8}, {78, 8}, {135, 11}, {72, 8}, {117, 11}, - {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, - {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, 
{159, 8}, - {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, - {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, - {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, - {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, - {66, 6}, {97, 7}, {79, 7}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, - {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, {156, 5}, {71, 7}, - {152, 5}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {155, 4}, - {69, 7}, {151, 4}, {147, 4}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, - {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, - {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, - {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 5}, {148, 5}, {165, 6}, - {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, {147, 4}, - {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {0, 4}, {0, 5}, {0, 5}, {0, 4}, {0, 5}, - {0, 4}, {0, 4}, {0, 5}, {0, 5}, {0, 3}, {0, 3}, {0, 5}, - {0, 2}, {0, 5}, {0, 5}, {0, 0}, {0, 4}, {0, 3}, {0, 3}, - {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, - {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, - {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, - {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, - {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, - {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, - {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, - {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, - {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, - {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, - {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, - {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, - {148, 5}, {165, 
6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, - {60, 10}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, - {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, - {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, - {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, - {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, - {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, - {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, - {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, {121, 7}, - {13, 9}, {58, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, - {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, - {19, 9}, {119, 7}, {11, 9}, {54, 10}, {83, 7}, {160, 5}, {7, 9}, - {46, 10}, {30, 10}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, {166, 7}, - {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, - {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, - {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, - {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, - {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, - {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, - {81, 9}, {142, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, - {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, - {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, - {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, - {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, - {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, - {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, - {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, - {20, 8}, {121, 7}, {12, 8}, {57, 10}, {85, 7}, {161, 6}, {66, 6}, - {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, 
{0, 0}, - {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {53, 10}, {83, 7}, - {160, 5}, {6, 8}, {45, 10}, {29, 10}, {130, 8}, {71, 7}, {112, 8}, - {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, - {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, - {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, - {118, 6}, {9, 8}, {51, 10}, {82, 6}, {160, 5}, {5, 8}, {43, 10}, - {27, 10}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, - {39, 10}, {23, 10}, {122, 8}, {15, 10}, {104, 8}, {86, 8}, {161, 6}, - {66, 6}, {98, 8}, {80, 8}, {140, 10}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, - {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {134, 10}, {72, 8}, - {116, 10}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, - {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, - {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, - {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, - {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, - {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, - {161, 6}, {66, 6}, {97, 7}, {79, 7}, {138, 10}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, - {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, {132, 10}, - {71, 7}, {114, 10}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, - {126, 10}, {69, 7}, {108, 10}, {90, 10}, {162, 7}, {145, 2}, {150, 3}, - {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, - {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, - {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 5}, {148, 5}, - {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, - {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, {145, 2}, - {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, - {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, - {156, 5}, 
{145, 2}, {152, 5}, {148, 5}, {169, 10}, {0, 4}, {0, 3}, - {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, - {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, - {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, - {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, - {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, - {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, - {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, - {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, - {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, - {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, - {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, - {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, - {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, - {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, - {14, 9}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, - {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, - {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, - {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, - {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, - {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, - {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, - {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, - {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, - {121, 7}, {13, 9}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, - {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, - {35, 9}, {19, 9}, {119, 7}, {11, 9}, {101, 7}, {83, 7}, {160, 5}, - {7, 9}, {95, 7}, {77, 7}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, - {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, - {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, - {154, 
7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, - {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, - {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, - {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, - {99, 9}, {81, 9}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, - {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, - {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, - {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, - {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, - {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, - {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, - {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, - {36, 8}, {20, 8}, {121, 7}, {12, 8}, {103, 7}, {85, 7}, {161, 6}, - {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, - {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {101, 7}, - {83, 7}, {160, 5}, {6, 8}, {95, 7}, {77, 7}, {130, 8}, {71, 7}, - {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, - {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, - {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, - {17, 8}, {118, 6}, {9, 8}, {100, 6}, {82, 6}, {160, 5}, {5, 8}, - {94, 6}, {76, 6}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, - {3, 8}, {92, 6}, {74, 6}, {122, 8}, {68, 6}, {104, 8}, {86, 8}, - {161, 6}, {66, 6}, {98, 8}, {80, 8}, {157, 6}, {145, 2}, {153, 6}, - {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, - {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {156, 5}, - {72, 8}, {152, 5}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, - {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, - {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, - {0, 7}, {0, 7}, {0, 6}, {0, 7}, {0, 6}, {0, 6}, {0, 5}, - {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 6}, {0, 7}, {0, 5}, - {0, 
6}, {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 6}, {0, 7}, - {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 7}, {0, 6}, {0, 2}, - {0, 6}, {0, 6}, {0, 0}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, - {0, 5}, {0, 7}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, {0, 7}, - {0, 5}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, {0, 4}, {0, 7}, - {0, 7}, {0, 4}, {0, 7}, {0, 4}, {0, 4}, {0, 7}, {0, 2}, - {0, 3}, {0, 3}, {0, 7}, {0, 2}, {0, 7}, {0, 0}, {0, 0}, - {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 5}, {0, 6}, {0, 6}, - {0, 5}, {0, 5}, {0, 6}, {0, 6}, {0, 5}, {0, 6}, {0, 5}, - {0, 5}, {0, 6}, {0, 4}, {0, 6}, {0, 6}, {0, 4}, {0, 6}, - {0, 4}, {0, 4}, {0, 6}, {0, 6}, {0, 3}, {0, 3}, {0, 6}, - {0, 2}, {0, 6}, {0, 6}, {0, 0}, {0, 4}, {0, 5}, {0, 5}, - {0, 4}, {0, 5}, {0, 4}, {0, 4}, {0, 5}, {0, 5}, {0, 3}, - {0, 3}, {0, 5}, {0, 2}, {0, 5}, {0, 5}, {0, 0}, {0, 4}, - {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, - {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, - {0, 0} -}; - +static SIMDCOMP_ALIGNED(0x1000) index_bytes_consumed combined_lookup[] = { + {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, + {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, + {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, + {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, + {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, + {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, + {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, + {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, + {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, + {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, + {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, + {14, 9}, {60, 10}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, + {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, + {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, 
{160, 5}, {65, 5}, + {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, + {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, + {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, + {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, + {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, + {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, + {121, 7}, {13, 9}, {58, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, + {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, + {35, 9}, {19, 9}, {119, 7}, {11, 9}, {54, 10}, {83, 7}, {160, 5}, + {7, 9}, {46, 10}, {30, 10}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, + {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, + {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, + {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, + {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, + {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, + {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, + {99, 9}, {81, 9}, {142, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, + {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, + {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, + {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, + {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, + {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, + {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, + {36, 8}, {20, 8}, {121, 7}, {12, 8}, {57, 10}, {85, 7}, {161, 6}, + {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {53, 10}, + {83, 7}, {160, 5}, {6, 8}, {45, 10}, {29, 10}, {130, 8}, {71, 7}, + {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, + {69, 7}, {106, 8}, 
{88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, + {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, + {17, 8}, {118, 6}, {9, 8}, {51, 10}, {82, 6}, {160, 5}, {5, 8}, + {43, 10}, {27, 10}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, + {3, 8}, {39, 10}, {23, 10}, {122, 8}, {15, 10}, {62, 11}, {86, 8}, + {161, 6}, {66, 6}, {98, 8}, {80, 8}, {140, 10}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, + {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {134, 10}, + {72, 8}, {116, 10}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, + {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, + {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, + {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, + {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, + {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, + {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {138, 10}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, + {67, 5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, + {132, 10}, {71, 7}, {114, 10}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, + {75, 7}, {126, 10}, {69, 7}, {108, 10}, {90, 10}, {162, 7}, {145, 2}, + {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, + {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, + {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 5}, + {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, + {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, + {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, + {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {169, 10}, {0, 4}, + {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, + {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, + {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 
8}, + {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, + {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, + {12, 8}, {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, + {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, + {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, + {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, + {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, + {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, + {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, + {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, + {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, + {122, 8}, {14, 9}, {61, 11}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, + {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, + {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, + {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, + {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, + {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, + {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, + {8, 7}, {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, + {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, + {21, 9}, {121, 7}, {13, 9}, {59, 11}, {85, 7}, {161, 6}, {66, 6}, + {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, {55, 11}, {83, 7}, + {160, 5}, {7, 9}, {47, 11}, {31, 11}, {131, 9}, {71, 7}, {113, 9}, + {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, + {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, + {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, + {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, + {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, + {92, 6}, {74, 6}, {123, 9}, {68, 6}, 
{105, 9}, {87, 9}, {161, 6}, + {66, 6}, {99, 9}, {81, 9}, {143, 11}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, + {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, + {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, + {145, 2}, {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, + {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, + {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, + {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, + {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {103, 7}, {85, 7}, + {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, + {101, 7}, {83, 7}, {160, 5}, {6, 8}, {95, 7}, {77, 7}, {130, 8}, + {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, + {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, + {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, + {33, 8}, {17, 8}, {118, 6}, {9, 8}, {100, 6}, {82, 6}, {160, 5}, + {5, 8}, {94, 6}, {76, 6}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, + {165, 6}, {3, 8}, {92, 6}, {74, 6}, {122, 8}, {68, 6}, {104, 8}, + {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {141, 11}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, + {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, + {135, 11}, {72, 8}, {117, 11}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, + {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, + {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, + {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, + {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, + {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, + {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {157, 6}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, + {119, 7}, {67, 
5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, + {77, 7}, {156, 5}, {71, 7}, {152, 5}, {148, 5}, {166, 7}, {64, 4}, + {93, 7}, {75, 7}, {155, 4}, {69, 7}, {151, 4}, {147, 4}, {162, 7}, + {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, + {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, + {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, + {152, 5}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, + {68, 6}, {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, + {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {0, 4}, {0, 5}, + {0, 5}, {0, 4}, {0, 5}, {0, 4}, {0, 4}, {0, 5}, {0, 5}, + {0, 3}, {0, 3}, {0, 5}, {0, 2}, {0, 5}, {0, 5}, {0, 0}, + {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, + {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, + {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, + {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, + {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, + {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, + {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, + {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, + {6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, + {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, + {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, + {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, + {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, + {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, + {22, 9}, {122, 8}, {14, 9}, {60, 10}, {86, 8}, {161, 6}, {66, 6}, + {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, + {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, + {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, + {151, 4}, {147, 4}, {163, 8}, 
{145, 2}, {150, 3}, {146, 3}, {159, 8}, + {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, + {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, + {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, + {37, 9}, {21, 9}, {121, 7}, {13, 9}, {58, 10}, {85, 7}, {161, 6}, + {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, {54, 10}, + {83, 7}, {160, 5}, {7, 9}, {46, 10}, {30, 10}, {131, 9}, {71, 7}, + {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, + {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, + {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, + {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, + {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, + {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, + {161, 6}, {66, 6}, {99, 9}, {81, 9}, {142, 10}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, + {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, + {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, + {155, 4}, {145, 2}, {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, + {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, + {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, + {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, + {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {57, 10}, + {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, + {10, 8}, {53, 10}, {83, 7}, {160, 5}, {6, 8}, {45, 10}, {29, 10}, + {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, + {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, + {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, + {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {51, 10}, {82, 6}, + {160, 5}, 
{5, 8}, {43, 10}, {27, 10}, {128, 8}, {70, 6}, {110, 8}, + {148, 5}, {165, 6}, {3, 8}, {39, 10}, {23, 10}, {122, 8}, {15, 10}, + {63, 12}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {140, 10}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, + {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, + {78, 8}, {134, 10}, {72, 8}, {116, 10}, {148, 5}, {167, 8}, {64, 4}, + {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, + {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, + {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {100, 6}, + {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, {70, 6}, + {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, {121, 7}, + {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, + {138, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {91, 5}, + {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, {160, 5}, {65, 5}, + {95, 7}, {77, 7}, {132, 10}, {71, 7}, {114, 10}, {148, 5}, {166, 7}, + {64, 4}, {93, 7}, {75, 7}, {126, 10}, {69, 7}, {108, 10}, {90, 10}, + {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, + {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, + {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {156, 5}, + {70, 6}, {152, 5}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, + {155, 4}, {68, 6}, {151, 4}, {147, 4}, {161, 6}, {66, 6}, {150, 3}, + {146, 3}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, + {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, + {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, + {169, 10}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, + {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, + {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, + {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, + {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, + {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 
6}, {66, 6}, + {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, {83, 7}, + {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, {112, 8}, + {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, + {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, + {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, + {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, {42, 9}, + {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, + {38, 9}, {22, 9}, {122, 8}, {14, 9}, {104, 8}, {86, 8}, {161, 6}, + {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, + {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, {72, 8}, + {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, + {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, + {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, + {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, {4, 7}, + {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, + {2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, {103, 7}, {85, 7}, + {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, {11, 9}, + {101, 7}, {83, 7}, {160, 5}, {7, 9}, {95, 7}, {77, 7}, {131, 9}, + {71, 7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, + {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, {150, 3}, + {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, + {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, + {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, {148, 5}, + {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, {105, 9}, + {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, {144, 12}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, + {67, 5}, {151, 4}, {147, 4}, 
{160, 5}, {65, 5}, {150, 3}, {146, 3}, + {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, {150, 3}, + {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {164, 9}, {0, 2}, + {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, + {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, + {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, + {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, + {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, + {119, 7}, {10, 8}, {101, 7}, {83, 7}, {160, 5}, {6, 8}, {95, 7}, + {77, 7}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, + {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, + {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, + {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {100, 6}, + {82, 6}, {160, 5}, {5, 8}, {94, 6}, {76, 6}, {128, 8}, {70, 6}, + {110, 8}, {148, 5}, {165, 6}, {3, 8}, {92, 6}, {74, 6}, {122, 8}, + {68, 6}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, + {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, + {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, + {96, 8}, {78, 8}, {156, 5}, {72, 8}, {152, 5}, {148, 5}, {167, 8}, + {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, + {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, + {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, + {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, {127, 7}, + {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, {74, 6}, + {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, + {79, 7}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, + {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, {160, 5}, + {65, 5}, {95, 7}, {77, 7}, {156, 5}, {71, 7}, {152, 5}, {148, 5}, + {166, 7}, {64, 4}, {93, 7}, {75, 7}, {155, 4}, {69, 7}, {151, 4}, + {147, 4}, 
{162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, + {154, 7}, {0, 0}, {0, 0}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, + {0, 5}, {0, 6}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 6}, + {0, 5}, {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 4}, {0, 6}, + {0, 6}, {0, 4}, {0, 6}, {0, 4}, {0, 4}, {0, 6}, {0, 6}, + {0, 3}, {0, 3}, {0, 6}, {0, 2}, {0, 6}, {0, 6}, {0, 0}, + {0, 4}, {0, 5}, {0, 5}, {0, 4}, {0, 5}, {0, 4}, {0, 4}, + {0, 5}, {0, 5}, {0, 3}, {0, 3}, {0, 5}, {0, 2}, {0, 5}, + {0, 5}, {0, 0}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, {0, 2}, + {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, + {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, + {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, + {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, + {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, {161, 6}, + {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {52, 9}, + {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, {71, 7}, + {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, + {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, + {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, + {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, {5, 8}, + {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, + {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, {60, 10}, {86, 8}, + {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, + {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {133, 9}, + {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, + {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, + {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, + {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, {160, 5}, + {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, + {165, 6}, 
{2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, {58, 10}, + {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, {119, 7}, + {11, 9}, {54, 10}, {83, 7}, {160, 5}, {7, 9}, {46, 10}, {30, 10}, + {131, 9}, {71, 7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, + {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, {145, 2}, + {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, + {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, + {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, {111, 9}, + {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, {68, 6}, + {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, {142, 10}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, + {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, + {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, {64, 4}, + {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {164, 9}, + {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, + {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, + {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, + {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, + {12, 8}, {57, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, + {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, + {18, 8}, {119, 7}, {10, 8}, {53, 10}, {83, 7}, {160, 5}, {6, 8}, + {45, 10}, {29, 10}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, + {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, + {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, + {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, + {51, 10}, {82, 6}, {160, 5}, {5, 8}, {43, 10}, {27, 10}, {128, 8}, + {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {39, 10}, {23, 10}, + {122, 8}, {15, 10}, {62, 11}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, + {80, 8}, {140, 10}, {145, 2}, {153, 6}, {149, 
6}, {0, 0}, {64, 4}, + {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, + {65, 5}, {96, 8}, {78, 8}, {134, 10}, {72, 8}, {116, 10}, {148, 5}, + {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, + {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, + {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, + {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, {76, 6}, + {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {92, 6}, + {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, + {97, 7}, {79, 7}, {138, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, {83, 7}, + {160, 5}, {65, 5}, {95, 7}, {77, 7}, {132, 10}, {71, 7}, {114, 10}, + {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {126, 10}, {69, 7}, + {108, 10}, {90, 10}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, + {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, + {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, + {76, 6}, {156, 5}, {70, 6}, {152, 5}, {148, 5}, {165, 6}, {64, 4}, + {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, {147, 4}, {161, 6}, + {66, 6}, {150, 3}, {146, 3}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, + {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, + {152, 5}, {148, 5}, {169, 10}, {0, 4}, {0, 3}, {0, 3}, {0, 4}, + {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, {0, 3}, + {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, + {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, + {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, + {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, {85, 7}, + {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, + {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, {130, 8}, + {71, 7}, {112, 8}, {148, 5}, 
{166, 7}, {64, 4}, {93, 7}, {75, 7}, + {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, + {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, + {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, {160, 5}, + {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, + {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, {61, 11}, + {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, + {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, + {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, + {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, + {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, + {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, {82, 6}, + {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, {109, 7}, + {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, {121, 7}, {13, 9}, + {59, 11}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {137, 9}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, {19, 9}, + {119, 7}, {11, 9}, {55, 11}, {83, 7}, {160, 5}, {7, 9}, {47, 11}, + {31, 11}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, {166, 7}, {64, 4}, + {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, {162, 7}, + {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, + {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, + {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, {70, 6}, + {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, {123, 9}, + {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, {81, 9}, + {143, 11}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, + {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, + {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, {168, 9}, + {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, + {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, + 
{0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, + {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, + {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, + {121, 7}, {12, 8}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, + {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, + {34, 8}, {18, 8}, {119, 7}, {10, 8}, {101, 7}, {83, 7}, {160, 5}, + {6, 8}, {95, 7}, {77, 7}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, + {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, + {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, + {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, {118, 6}, + {9, 8}, {100, 6}, {82, 6}, {160, 5}, {5, 8}, {94, 6}, {76, 6}, + {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, {92, 6}, + {74, 6}, {122, 8}, {68, 6}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, + {98, 8}, {80, 8}, {141, 11}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, + {160, 5}, {65, 5}, {96, 8}, {78, 8}, {135, 11}, {72, 8}, {117, 11}, + {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, + {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, + {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, + {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, {94, 6}, + {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, + {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, {161, 6}, + {66, 6}, {97, 7}, {79, 7}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, {101, 7}, + {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, {156, 5}, {71, 7}, + {152, 5}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {155, 4}, + {69, 7}, {151, 4}, {147, 4}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, + {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, + {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, + {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 
5}, {148, 5}, {165, 6}, + {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, {147, 4}, + {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {0, 4}, {0, 5}, {0, 5}, {0, 4}, {0, 5}, + {0, 4}, {0, 4}, {0, 5}, {0, 5}, {0, 3}, {0, 3}, {0, 5}, + {0, 2}, {0, 5}, {0, 5}, {0, 0}, {0, 4}, {0, 3}, {0, 3}, + {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, {0, 3}, + {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, + {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, + {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, + {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, {56, 9}, + {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, + {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, {28, 9}, + {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, + {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, + {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, + {0, 6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, {82, 6}, + {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, {110, 8}, + {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, {14, 9}, + {60, 10}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, {139, 9}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, + {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, + {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, {64, 4}, + {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, + {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, + {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {49, 9}, + {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, {70, 6}, + {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, {121, 7}, + {13, 9}, {58, 10}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, + {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {35, 9}, + 
{19, 9}, {119, 7}, {11, 9}, {54, 10}, {83, 7}, {160, 5}, {7, 9}, + {46, 10}, {30, 10}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, {166, 7}, + {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, {89, 9}, + {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, + {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, {67, 5}, + {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, {129, 9}, + {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, {74, 6}, + {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, {99, 9}, + {81, 9}, {142, 10}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, + {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, {160, 5}, + {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, {148, 5}, + {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, + {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, + {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, + {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, {24, 8}, + {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {36, 8}, + {20, 8}, {121, 7}, {12, 8}, {57, 10}, {85, 7}, {161, 6}, {66, 6}, + {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {53, 10}, {83, 7}, + {160, 5}, {6, 8}, {45, 10}, {29, 10}, {130, 8}, {71, 7}, {112, 8}, + {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, {69, 7}, + {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, + {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, {17, 8}, + {118, 6}, {9, 8}, {51, 10}, {82, 6}, {160, 5}, {5, 8}, {43, 10}, + {27, 10}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, {3, 8}, + {39, 10}, {23, 10}, {122, 8}, {15, 10}, {104, 8}, {86, 8}, {161, 6}, + {66, 6}, {98, 8}, {80, 8}, {140, 10}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, {102, 8}, + {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {134, 10}, {72, 8}, + {116, 10}, {148, 5}, {167, 8}, 
{64, 4}, {150, 3}, {146, 3}, {155, 4}, + {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, {146, 3}, + {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, + {16, 7}, {118, 6}, {8, 7}, {100, 6}, {82, 6}, {160, 5}, {4, 7}, + {94, 6}, {76, 6}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, + {2, 7}, {92, 6}, {74, 6}, {121, 7}, {68, 6}, {103, 7}, {85, 7}, + {161, 6}, {66, 6}, {97, 7}, {79, 7}, {138, 10}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {1, 7}, {91, 5}, {73, 5}, {119, 7}, {67, 5}, + {101, 7}, {83, 7}, {160, 5}, {65, 5}, {95, 7}, {77, 7}, {132, 10}, + {71, 7}, {114, 10}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, + {126, 10}, {69, 7}, {108, 10}, {90, 10}, {162, 7}, {145, 2}, {150, 3}, + {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, + {91, 5}, {73, 5}, {118, 6}, {67, 5}, {100, 6}, {82, 6}, {160, 5}, + {65, 5}, {94, 6}, {76, 6}, {156, 5}, {70, 6}, {152, 5}, {148, 5}, + {165, 6}, {64, 4}, {92, 6}, {74, 6}, {155, 4}, {68, 6}, {151, 4}, + {147, 4}, {161, 6}, {66, 6}, {150, 3}, {146, 3}, {157, 6}, {145, 2}, + {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {155, 4}, + {67, 5}, {151, 4}, {147, 4}, {160, 5}, {65, 5}, {150, 3}, {146, 3}, + {156, 5}, {145, 2}, {152, 5}, {148, 5}, {169, 10}, {0, 4}, {0, 3}, + {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, {0, 2}, + {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, + {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, {48, 8}, {82, 6}, + {160, 5}, {4, 7}, {40, 8}, {24, 8}, {127, 7}, {70, 6}, {109, 7}, + {148, 5}, {165, 6}, {2, 7}, {36, 8}, {20, 8}, {121, 7}, {12, 8}, + {56, 9}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, {79, 7}, {136, 8}, + {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, {34, 8}, {18, 8}, + {119, 7}, {10, 8}, {52, 9}, {83, 7}, {160, 5}, {6, 8}, {44, 9}, + {28, 9}, {130, 8}, {71, 7}, {112, 8}, {148, 5}, {166, 7}, {64, 4}, + {93, 7}, {75, 7}, {124, 8}, {69, 7}, {106, 8}, {88, 8}, {162, 7}, + {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, {154, 7}, {0, 0}, + {0, 0}, {0, 
6}, {33, 8}, {17, 8}, {118, 6}, {9, 8}, {50, 9}, + {82, 6}, {160, 5}, {5, 8}, {42, 9}, {26, 9}, {128, 8}, {70, 6}, + {110, 8}, {148, 5}, {165, 6}, {3, 8}, {38, 9}, {22, 9}, {122, 8}, + {14, 9}, {104, 8}, {86, 8}, {161, 6}, {66, 6}, {98, 8}, {80, 8}, + {139, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {64, 4}, {91, 5}, + {73, 5}, {120, 8}, {67, 5}, {102, 8}, {84, 8}, {160, 5}, {65, 5}, + {96, 8}, {78, 8}, {133, 9}, {72, 8}, {115, 9}, {148, 5}, {167, 8}, + {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, {151, 4}, {147, 4}, + {163, 8}, {145, 2}, {150, 3}, {146, 3}, {159, 8}, {0, 2}, {0, 0}, + {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, {118, 6}, {8, 7}, + {49, 9}, {82, 6}, {160, 5}, {4, 7}, {41, 9}, {25, 9}, {127, 7}, + {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 7}, {37, 9}, {21, 9}, + {121, 7}, {13, 9}, {103, 7}, {85, 7}, {161, 6}, {66, 6}, {97, 7}, + {79, 7}, {137, 9}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, {1, 7}, + {35, 9}, {19, 9}, {119, 7}, {11, 9}, {101, 7}, {83, 7}, {160, 5}, + {7, 9}, {95, 7}, {77, 7}, {131, 9}, {71, 7}, {113, 9}, {148, 5}, + {166, 7}, {64, 4}, {93, 7}, {75, 7}, {125, 9}, {69, 7}, {107, 9}, + {89, 9}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, {158, 7}, {145, 2}, + {154, 7}, {0, 0}, {0, 0}, {0, 6}, {91, 5}, {73, 5}, {118, 6}, + {67, 5}, {100, 6}, {82, 6}, {160, 5}, {65, 5}, {94, 6}, {76, 6}, + {129, 9}, {70, 6}, {111, 9}, {148, 5}, {165, 6}, {64, 4}, {92, 6}, + {74, 6}, {123, 9}, {68, 6}, {105, 9}, {87, 9}, {161, 6}, {66, 6}, + {99, 9}, {81, 9}, {157, 6}, {145, 2}, {153, 6}, {149, 6}, {0, 0}, + {64, 4}, {91, 5}, {73, 5}, {155, 4}, {67, 5}, {151, 4}, {147, 4}, + {160, 5}, {65, 5}, {150, 3}, {146, 3}, {156, 5}, {145, 2}, {152, 5}, + {148, 5}, {168, 9}, {64, 4}, {150, 3}, {146, 3}, {155, 4}, {145, 2}, + {151, 4}, {147, 4}, {164, 9}, {0, 2}, {0, 3}, {0, 3}, {0, 0}, + {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, {32, 7}, {16, 7}, + {118, 6}, {8, 7}, {48, 8}, {82, 6}, {160, 5}, {4, 7}, {40, 8}, + {24, 8}, {127, 7}, {70, 6}, {109, 7}, {148, 5}, {165, 6}, {2, 
7}, + {36, 8}, {20, 8}, {121, 7}, {12, 8}, {103, 7}, {85, 7}, {161, 6}, + {66, 6}, {97, 7}, {79, 7}, {136, 8}, {145, 2}, {153, 6}, {149, 6}, + {0, 0}, {1, 7}, {34, 8}, {18, 8}, {119, 7}, {10, 8}, {101, 7}, + {83, 7}, {160, 5}, {6, 8}, {95, 7}, {77, 7}, {130, 8}, {71, 7}, + {112, 8}, {148, 5}, {166, 7}, {64, 4}, {93, 7}, {75, 7}, {124, 8}, + {69, 7}, {106, 8}, {88, 8}, {162, 7}, {145, 2}, {150, 3}, {146, 3}, + {158, 7}, {145, 2}, {154, 7}, {0, 0}, {0, 0}, {0, 6}, {33, 8}, + {17, 8}, {118, 6}, {9, 8}, {100, 6}, {82, 6}, {160, 5}, {5, 8}, + {94, 6}, {76, 6}, {128, 8}, {70, 6}, {110, 8}, {148, 5}, {165, 6}, + {3, 8}, {92, 6}, {74, 6}, {122, 8}, {68, 6}, {104, 8}, {86, 8}, + {161, 6}, {66, 6}, {98, 8}, {80, 8}, {157, 6}, {145, 2}, {153, 6}, + {149, 6}, {0, 0}, {64, 4}, {91, 5}, {73, 5}, {120, 8}, {67, 5}, + {102, 8}, {84, 8}, {160, 5}, {65, 5}, {96, 8}, {78, 8}, {156, 5}, + {72, 8}, {152, 5}, {148, 5}, {167, 8}, {64, 4}, {150, 3}, {146, 3}, + {155, 4}, {145, 2}, {151, 4}, {147, 4}, {163, 8}, {145, 2}, {150, 3}, + {146, 3}, {159, 8}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 6}, + {0, 7}, {0, 7}, {0, 6}, {0, 7}, {0, 6}, {0, 6}, {0, 5}, + {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 6}, {0, 7}, {0, 5}, + {0, 6}, {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 6}, {0, 7}, + {0, 7}, {0, 6}, {0, 6}, {0, 7}, {0, 7}, {0, 6}, {0, 2}, + {0, 6}, {0, 6}, {0, 0}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, + {0, 5}, {0, 7}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, {0, 7}, + {0, 5}, {0, 7}, {0, 5}, {0, 5}, {0, 7}, {0, 4}, {0, 7}, + {0, 7}, {0, 4}, {0, 7}, {0, 4}, {0, 4}, {0, 7}, {0, 2}, + {0, 3}, {0, 3}, {0, 7}, {0, 2}, {0, 7}, {0, 0}, {0, 0}, + {0, 6}, {0, 5}, {0, 5}, {0, 6}, {0, 5}, {0, 6}, {0, 6}, + {0, 5}, {0, 5}, {0, 6}, {0, 6}, {0, 5}, {0, 6}, {0, 5}, + {0, 5}, {0, 6}, {0, 4}, {0, 6}, {0, 6}, {0, 4}, {0, 6}, + {0, 4}, {0, 4}, {0, 6}, {0, 6}, {0, 3}, {0, 3}, {0, 6}, + {0, 2}, {0, 6}, {0, 6}, {0, 0}, {0, 4}, {0, 5}, {0, 5}, + {0, 4}, {0, 5}, {0, 4}, {0, 4}, {0, 5}, {0, 5}, {0, 3}, + {0, 3}, {0, 5}, {0, 2}, {0, 5}, {0, 5}, {0, 
0}, {0, 4}, + {0, 3}, {0, 3}, {0, 4}, {0, 2}, {0, 4}, {0, 4}, {0, 0}, + {0, 2}, {0, 3}, {0, 3}, {0, 0}, {0, 2}, {0, 0}, {0, 0}, + {0, 0}}; static SIMDCOMP_ALIGNED(0x1000) const int8_t vectorsrawbytes[] = { - 0, -1, 4, -1, 1, -1, 5, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 0 - 0, -1, 4, -1, 1, -1, 5, 6, 2, -1, -1, -1, 3, -1, -1, -1, // 1 - 0, -1, 4, 5, 1, -1, 6, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 2 - 0, -1, 4, 5, 1, -1, 6, 7, 2, -1, -1, -1, 3, -1, -1, -1, // 3 - 0, -1, 5, -1, 1, -1, 6, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 4 - 0, -1, 5, -1, 1, -1, 6, 7, 2, -1, -1, -1, 3, 4, -1, -1, // 5 - 0, -1, 5, 6, 1, -1, 7, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 6 - 0, -1, 5, 6, 1, -1, 7, 8, 2, -1, -1, -1, 3, 4, -1, -1, // 7 - 0, -1, 5, -1, 1, -1, 6, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 8 - 0, -1, 5, -1, 1, -1, 6, 7, 2, 3, -1, -1, 4, -1, -1, -1, // 9 - 0, -1, 5, 6, 1, -1, 7, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 10 - 0, -1, 5, 6, 1, -1, 7, 8, 2, 3, -1, -1, 4, -1, -1, -1, // 11 - 0, -1, 6, -1, 1, -1, 7, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 12 - 0, -1, 6, -1, 1, -1, 7, 8, 2, 3, -1, -1, 4, 5, -1, -1, // 13 - 0, -1, 6, 7, 1, -1, 8, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 14 - 0, -1, 6, 7, 1, -1, 8, 9, 2, 3, -1, -1, 4, 5, -1, -1, // 15 - 0, -1, 5, -1, 1, 2, 6, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 16 - 0, -1, 5, -1, 1, 2, 6, 7, 3, -1, -1, -1, 4, -1, -1, -1, // 17 - 0, -1, 5, 6, 1, 2, 7, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 18 - 0, -1, 5, 6, 1, 2, 7, 8, 3, -1, -1, -1, 4, -1, -1, -1, // 19 - 0, -1, 6, -1, 1, 2, 7, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 20 - 0, -1, 6, -1, 1, 2, 7, 8, 3, -1, -1, -1, 4, 5, -1, -1, // 21 - 0, -1, 6, 7, 1, 2, 8, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 22 - 0, -1, 6, 7, 1, 2, 8, 9, 3, -1, -1, -1, 4, 5, -1, -1, // 23 - 0, -1, 6, -1, 1, 2, 7, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 24 - 0, -1, 6, -1, 1, 2, 7, 8, 3, 4, -1, -1, 5, -1, -1, -1, // 25 - 0, -1, 6, 7, 1, 2, 8, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 26 - 0, -1, 6, 7, 1, 2, 8, 9, 3, 4, -1, -1, 5, -1, -1, -1, // 27 - 0, -1, 7, -1, 1, 
2, 8, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 28 - 0, -1, 7, -1, 1, 2, 8, 9, 3, 4, -1, -1, 5, 6, -1, -1, // 29 - 0, -1, 7, 8, 1, 2, 9, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 30 - 0, -1, 7, 8, 1, 2, 9, 10, 3, 4, -1, -1, 5, 6, -1, -1, // 31 - 0, 1, 5, -1, 2, -1, 6, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 32 - 0, 1, 5, -1, 2, -1, 6, 7, 3, -1, -1, -1, 4, -1, -1, -1, // 33 - 0, 1, 5, 6, 2, -1, 7, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 34 - 0, 1, 5, 6, 2, -1, 7, 8, 3, -1, -1, -1, 4, -1, -1, -1, // 35 - 0, 1, 6, -1, 2, -1, 7, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 36 - 0, 1, 6, -1, 2, -1, 7, 8, 3, -1, -1, -1, 4, 5, -1, -1, // 37 - 0, 1, 6, 7, 2, -1, 8, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 38 - 0, 1, 6, 7, 2, -1, 8, 9, 3, -1, -1, -1, 4, 5, -1, -1, // 39 - 0, 1, 6, -1, 2, -1, 7, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 40 - 0, 1, 6, -1, 2, -1, 7, 8, 3, 4, -1, -1, 5, -1, -1, -1, // 41 - 0, 1, 6, 7, 2, -1, 8, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 42 - 0, 1, 6, 7, 2, -1, 8, 9, 3, 4, -1, -1, 5, -1, -1, -1, // 43 - 0, 1, 7, -1, 2, -1, 8, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 44 - 0, 1, 7, -1, 2, -1, 8, 9, 3, 4, -1, -1, 5, 6, -1, -1, // 45 - 0, 1, 7, 8, 2, -1, 9, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 46 - 0, 1, 7, 8, 2, -1, 9, 10, 3, 4, -1, -1, 5, 6, -1, -1, // 47 - 0, 1, 6, -1, 2, 3, 7, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 48 - 0, 1, 6, -1, 2, 3, 7, 8, 4, -1, -1, -1, 5, -1, -1, -1, // 49 - 0, 1, 6, 7, 2, 3, 8, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 50 - 0, 1, 6, 7, 2, 3, 8, 9, 4, -1, -1, -1, 5, -1, -1, -1, // 51 - 0, 1, 7, -1, 2, 3, 8, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 52 - 0, 1, 7, -1, 2, 3, 8, 9, 4, -1, -1, -1, 5, 6, -1, -1, // 53 - 0, 1, 7, 8, 2, 3, 9, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 54 - 0, 1, 7, 8, 2, 3, 9, 10, 4, -1, -1, -1, 5, 6, -1, -1, // 55 - 0, 1, 7, -1, 2, 3, 8, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 56 - 0, 1, 7, -1, 2, 3, 8, 9, 4, 5, -1, -1, 6, -1, -1, -1, // 57 - 0, 1, 7, 8, 2, 3, 9, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 58 - 0, 1, 7, 8, 2, 3, 9, 10, 4, 5, -1, -1, 6, -1, -1, -1, // 59 - 0, 1, 8, 
-1, 2, 3, 9, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 60 - 0, 1, 8, -1, 2, 3, 9, 10, 4, 5, -1, -1, 6, 7, -1, -1, // 61 - 0, 1, 8, 9, 2, 3, 10, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 62 - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, -1, -1, 6, 7, -1, -1, // 63 - 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 64 - 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 65 - 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, // 66 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 67 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 68 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, // 69 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, // 70 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, // 71 - 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, // 72 - 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 73 - 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 74 - 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1, // 75 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 76 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 77 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1, // 78 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1, // 79 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1, // 80 - 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1, // 81 - 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 82 - 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 83 - 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 84 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 85 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 86 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 87 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 88 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 89 - 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 90 - 0, 1, 
-1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 91 - 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 92 - 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1, // 93 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 94 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 95 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1, // 96 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1, // 97 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1, // 98 - 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1, // 99 - 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 100 - 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 101 - 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 102 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 103 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 104 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 105 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 106 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 107 - 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 108 - 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1, // 109 - 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1, // 110 - 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1, // 111 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1, // 112 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1, // 113 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1, // 114 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1, // 115 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1, // 116 - 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1, // 117 - 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 118 - 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 119 - 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 120 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 
121 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 122 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 123 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 124 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 125 - 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 126 - 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, // 127 - 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1, // 128 - 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1, // 129 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1, // 130 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1, // 131 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1, // 132 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1, // 133 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1, // 134 - 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1, // 135 - 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1, // 136 - 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1, // 137 - 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1, // 138 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1, // 139 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1, // 140 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1, // 141 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1, // 142 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1, // 143 - 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, // 144 - -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 1, // 145 - -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, -1, -1, -1, -1, -1, 1, // 146 - -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, -1, -1, -1, 1, // 147 - -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, -1, -1, 1, // 148 - -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, -1, 5, 1, // 149 - 1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 2, // 150 - 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, -1, -1, -1, -1, -1, 2, // 151 - 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, 
-1, -1, -1, -1, 2, // 152 - 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, -1, -1, 2, // 153 - 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, -1, 6, 2, // 154 - 1, -1, 2, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 3, // 155 - 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, -1, -1, -1, -1, -1, 3, // 156 - 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, -1, -1, -1, 3, // 157 - 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, -1, -1, 3, // 158 - 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, -1, 7, 3, // 159 - 1, -1, 2, -1, 3, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 4, // 160 - 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, -1, -1, -1, -1, -1, 4, // 161 - 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, -1, -1, -1, 4, // 162 - 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, -1, -1, 4, // 163 - 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, -1, 8, 4, // 164 - 1, -1, 2, -1, 3, -1, 4, 0, -1, -1, -1, -1, -1, -1, -1, 5, // 165 - 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, -1, -1, -1, -1, -1, 5, // 166 - 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, -1, -1, -1, 5, // 167 - 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, 8, -1, -1, 5, // 168 - 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, 8, -1, 9, 5, // 169 + 0, -1, 4, -1, 1, -1, 5, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 0 + 0, -1, 4, -1, 1, -1, 5, 6, 2, -1, -1, -1, 3, -1, -1, -1, // 1 + 0, -1, 4, 5, 1, -1, 6, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 2 + 0, -1, 4, 5, 1, -1, 6, 7, 2, -1, -1, -1, 3, -1, -1, -1, // 3 + 0, -1, 5, -1, 1, -1, 6, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 4 + 0, -1, 5, -1, 1, -1, 6, 7, 2, -1, -1, -1, 3, 4, -1, -1, // 5 + 0, -1, 5, 6, 1, -1, 7, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 6 + 0, -1, 5, 6, 1, -1, 7, 8, 2, -1, -1, -1, 3, 4, -1, -1, // 7 + 0, -1, 5, -1, 1, -1, 6, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 8 + 0, -1, 5, -1, 1, -1, 6, 7, 2, 3, -1, -1, 4, -1, -1, -1, // 9 + 0, -1, 5, 6, 1, -1, 7, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 10 + 0, -1, 5, 6, 1, -1, 7, 8, 2, 3, -1, -1, 4, -1, -1, -1, // 11 + 0, -1, 6, -1, 1, -1, 7, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 12 + 0, -1, 6, -1, 1, -1, 
7, 8, 2, 3, -1, -1, 4, 5, -1, -1, // 13 + 0, -1, 6, 7, 1, -1, 8, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 14 + 0, -1, 6, 7, 1, -1, 8, 9, 2, 3, -1, -1, 4, 5, -1, -1, // 15 + 0, -1, 5, -1, 1, 2, 6, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 16 + 0, -1, 5, -1, 1, 2, 6, 7, 3, -1, -1, -1, 4, -1, -1, -1, // 17 + 0, -1, 5, 6, 1, 2, 7, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 18 + 0, -1, 5, 6, 1, 2, 7, 8, 3, -1, -1, -1, 4, -1, -1, -1, // 19 + 0, -1, 6, -1, 1, 2, 7, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 20 + 0, -1, 6, -1, 1, 2, 7, 8, 3, -1, -1, -1, 4, 5, -1, -1, // 21 + 0, -1, 6, 7, 1, 2, 8, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 22 + 0, -1, 6, 7, 1, 2, 8, 9, 3, -1, -1, -1, 4, 5, -1, -1, // 23 + 0, -1, 6, -1, 1, 2, 7, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 24 + 0, -1, 6, -1, 1, 2, 7, 8, 3, 4, -1, -1, 5, -1, -1, -1, // 25 + 0, -1, 6, 7, 1, 2, 8, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 26 + 0, -1, 6, 7, 1, 2, 8, 9, 3, 4, -1, -1, 5, -1, -1, -1, // 27 + 0, -1, 7, -1, 1, 2, 8, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 28 + 0, -1, 7, -1, 1, 2, 8, 9, 3, 4, -1, -1, 5, 6, -1, -1, // 29 + 0, -1, 7, 8, 1, 2, 9, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 30 + 0, -1, 7, 8, 1, 2, 9, 10, 3, 4, -1, -1, 5, 6, -1, -1, // 31 + 0, 1, 5, -1, 2, -1, 6, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 32 + 0, 1, 5, -1, 2, -1, 6, 7, 3, -1, -1, -1, 4, -1, -1, -1, // 33 + 0, 1, 5, 6, 2, -1, 7, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 34 + 0, 1, 5, 6, 2, -1, 7, 8, 3, -1, -1, -1, 4, -1, -1, -1, // 35 + 0, 1, 6, -1, 2, -1, 7, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 36 + 0, 1, 6, -1, 2, -1, 7, 8, 3, -1, -1, -1, 4, 5, -1, -1, // 37 + 0, 1, 6, 7, 2, -1, 8, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 38 + 0, 1, 6, 7, 2, -1, 8, 9, 3, -1, -1, -1, 4, 5, -1, -1, // 39 + 0, 1, 6, -1, 2, -1, 7, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 40 + 0, 1, 6, -1, 2, -1, 7, 8, 3, 4, -1, -1, 5, -1, -1, -1, // 41 + 0, 1, 6, 7, 2, -1, 8, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 42 + 0, 1, 6, 7, 2, -1, 8, 9, 3, 4, -1, -1, 5, -1, -1, -1, // 43 + 0, 1, 7, -1, 2, -1, 8, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 44 + 
0, 1, 7, -1, 2, -1, 8, 9, 3, 4, -1, -1, 5, 6, -1, -1, // 45 + 0, 1, 7, 8, 2, -1, 9, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 46 + 0, 1, 7, 8, 2, -1, 9, 10, 3, 4, -1, -1, 5, 6, -1, -1, // 47 + 0, 1, 6, -1, 2, 3, 7, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 48 + 0, 1, 6, -1, 2, 3, 7, 8, 4, -1, -1, -1, 5, -1, -1, -1, // 49 + 0, 1, 6, 7, 2, 3, 8, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 50 + 0, 1, 6, 7, 2, 3, 8, 9, 4, -1, -1, -1, 5, -1, -1, -1, // 51 + 0, 1, 7, -1, 2, 3, 8, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 52 + 0, 1, 7, -1, 2, 3, 8, 9, 4, -1, -1, -1, 5, 6, -1, -1, // 53 + 0, 1, 7, 8, 2, 3, 9, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 54 + 0, 1, 7, 8, 2, 3, 9, 10, 4, -1, -1, -1, 5, 6, -1, -1, // 55 + 0, 1, 7, -1, 2, 3, 8, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 56 + 0, 1, 7, -1, 2, 3, 8, 9, 4, 5, -1, -1, 6, -1, -1, -1, // 57 + 0, 1, 7, 8, 2, 3, 9, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 58 + 0, 1, 7, 8, 2, 3, 9, 10, 4, 5, -1, -1, 6, -1, -1, -1, // 59 + 0, 1, 8, -1, 2, 3, 9, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 60 + 0, 1, 8, -1, 2, 3, 9, 10, 4, 5, -1, -1, 6, 7, -1, -1, // 61 + 0, 1, 8, 9, 2, 3, 10, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 62 + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, -1, -1, 6, 7, -1, -1, // 63 + 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, // 64 + 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, // 65 + 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, // 66 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, // 67 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, // 68 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, // 69 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, // 70 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, // 71 + 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, // 72 + 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 73 + 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 74 + 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1, // 75 + 0, -1, -1, -1, 1, 2, 
-1, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 76 + 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 77 + 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1, // 78 + 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1, // 79 + 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1, // 80 + 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1, // 81 + 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 82 + 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 83 + 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 84 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 85 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 86 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 87 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 88 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 89 + 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 90 + 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1, // 91 + 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1, // 92 + 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1, // 93 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1, // 94 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1, // 95 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1, // 96 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1, // 97 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1, // 98 + 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1, // 99 + 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 100 + 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 101 + 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 102 + 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 103 + 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 104 + 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 105 + 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 106 + 0, 1, -1, -1, 
2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 107 + 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 108 + 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1, // 109 + 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1, // 110 + 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1, // 111 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1, // 112 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1, // 113 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1, // 114 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1, // 115 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1, // 116 + 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1, // 117 + 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, // 118 + 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1, // 119 + 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1, // 120 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1, // 121 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, // 122 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1, // 123 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1, // 124 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1, // 125 + 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1, // 126 + 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, // 127 + 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1, // 128 + 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1, // 129 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1, // 130 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1, // 131 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1, // 132 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1, // 133 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1, // 134 + 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1, // 135 + 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1, // 136 + 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1, // 137 + 0, 1, 2, -1, 3, 4, 
5, -1, 6, -1, -1, -1, 7, 8, 9, -1, // 138 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1, // 139 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1, // 140 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1, // 141 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1, // 142 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1, // 143 + 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, // 144 + -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 1, // 145 + -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, -1, -1, -1, -1, -1, 1, // 146 + -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, -1, -1, -1, 1, // 147 + -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, -1, -1, 1, // 148 + -1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, -1, 5, 1, // 149 + 1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 2, // 150 + 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, -1, -1, -1, -1, -1, 2, // 151 + 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, -1, -1, -1, 2, // 152 + 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, -1, -1, 2, // 153 + 1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, -1, 6, 2, // 154 + 1, -1, 2, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 3, // 155 + 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, -1, -1, -1, -1, -1, 3, // 156 + 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, -1, -1, -1, 3, // 157 + 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, -1, -1, 3, // 158 + 1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, -1, 7, 3, // 159 + 1, -1, 2, -1, 3, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 4, // 160 + 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, -1, -1, -1, -1, -1, 4, // 161 + 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, -1, -1, -1, 4, // 162 + 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, -1, -1, 4, // 163 + 1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, -1, 8, 4, // 164 + 1, -1, 2, -1, 3, -1, 4, 0, -1, -1, -1, -1, -1, -1, -1, 5, // 165 + 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, -1, -1, -1, -1, -1, 5, // 166 + 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, -1, -1, -1, 5, // 167 + 1, -1, 2, -1, 3, -1, 4, 0, 
6, -1, 7, -1, 8, -1, -1, 5, // 168 + 1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, 8, -1, 9, 5, // 169 }; -static const __m128i* vectors = (const __m128i*)vectorsrawbytes; - -static uint8_t masked_vbyte_read_group(const uint8_t* in, uint32_t* out, - uint64_t mask, uint64_t* ints_read) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - __m128i * mout = (__m128i *) out; - - if (!(mask & 0xFFFF)) { - __m128i result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 1, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 2, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 3, result); - *ints_read = 16; - return 16; - } - - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; - uint8_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - // Slightly slower to use combined than to lookup individually? 
- // uint64_t consumed = bytes_consumed[low_12_bits]; - // uint8_t index = vec_lookup[low_12_bits]; - - __m128i shuffle_vector = vectors[index]; - // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, less at small - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - _mm_storeu_si128(mout, unpacked_result_a); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - _mm_storel_epi64(mout+1, unpacked_result_b); - //_mm_storeu_si128(mout + 1, unpacked_result_b); // maybe faster to write 16 bytes? - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - _mm_storeu_si128(mout, result); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = 
_mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - _mm_storel_epi64(mout, result); - //_mm_storeu_si128(mout, result); // maybe faster to write 16 bytes? - +static const __m128i *vectors = (const __m128i *)vectorsrawbytes; + +static uint8_t masked_vbyte_read_group(const uint8_t *in, uint32_t *out, + uint64_t mask, uint64_t *ints_read) { + __m128i initial = _mm_lddqu_si128((const __m128i *)(in)); + __m128i *mout = (__m128i *)out; + + if (!(mask & 0xFFFF)) { + __m128i result = _mm_cvtepi8_epi32(initial); + _mm_storeu_si128(mout, result); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + _mm_storeu_si128(mout + 1, result); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + _mm_storeu_si128(mout + 2, result); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + _mm_storeu_si128(mout + 3, result); + *ints_read = 16; + return 16; + } + + uint32_t low_12_bits = mask & 0xFFF; + // combine index and bytes consumed into a single lookup + index_bytes_consumed combined = combined_lookup[low_12_bits]; + uint8_t consumed = combined.bytes_consumed; + uint8_t index = combined.index; + + // Slightly slower to use combined than to lookup individually? 
+ // uint64_t consumed = bytes_consumed[low_12_bits]; + // uint8_t index = vec_lookup[low_12_bits]; + + __m128i shuffle_vector = vectors[index]; + // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, + //less at small + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + _mm_storeu_si128(mout, unpacked_result_a); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + _mm_storel_epi64(mout + 1, unpacked_result_b); + //_mm_storeu_si128(mout + 1, unpacked_result_b); // maybe faster to write 16 + //bytes? + return consumed; + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + _mm_storeu_si128(mout, result); return consumed; + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = 
_mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); + __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + _mm_storel_epi64(mout, result); + //_mm_storeu_si128(mout, result); // maybe faster to write 16 bytes? + + return consumed; } static inline __m128i PrefixSum(__m128i curr, __m128i prev) { - __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B C] (already done) - prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] - curr = _mm_add_epi32(curr, Add); // Cycle 2: [A AB BC CD] - Add = _mm_slli_si128(curr, 8); // Cycle 3: [- - A AB] - curr = _mm_add_epi32(curr, prev); // Cycle 3: [PA PAB PBC PCD] - curr = _mm_add_epi32(curr, Add); // Cycle 4: [PA PAB PABC PABCD] - return curr; + __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B C] (already done) + prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] + curr = _mm_add_epi32(curr, Add); // Cycle 2: [A AB BC CD] + Add = _mm_slli_si128(curr, 8); // Cycle 3: [- - A AB] + curr = _mm_add_epi32(curr, prev); // Cycle 3: [PA PAB PBC PCD] + curr = _mm_add_epi32(curr, Add); // Cycle 4: [PA PAB PABC PABCD] + return curr; } // only the first two ints of curr are meaningful, rest is garbage to beignored static inline __m128i PrefixSum2ints(__m128i curr, __m128i prev) { - __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B G] (already done) - prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] - curr = _mm_add_epi32(curr, Add); // Cycle 2: [A AB BG GG] - curr = _mm_shuffle_epi32(curr, 0x54); //Cycle 3:[A AB AB AB] - curr = _mm_add_epi32(curr, prev); // Cycle 4: [PA PAB PAB PAB] - return curr; + __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B G] (already done) + prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] + curr = 
_mm_add_epi32(curr, Add); // Cycle 2: [A AB BG GG] + curr = _mm_shuffle_epi32(curr, 0x54); // Cycle 3:[A AB AB AB] + curr = _mm_add_epi32(curr, prev); // Cycle 4: [PA PAB PAB PAB] + return curr; } -static uint8_t masked_vbyte_read_group_delta(const uint8_t* in, uint32_t* out, - uint64_t mask, uint64_t* ints_read, __m128i * prev) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - __m128i * mout = (__m128i *) out; - - if (!(mask & 0xFFFF)) { - __m128i result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 1, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 2, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 3, *prev); - *ints_read = 16; - return 16; - } +static uint8_t masked_vbyte_read_group_delta(const uint8_t *in, uint32_t *out, + uint64_t mask, uint64_t *ints_read, + __m128i *prev) { + __m128i initial = _mm_lddqu_si128((const __m128i *)(in)); + __m128i *mout = (__m128i *)out; - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; - uint8_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - __m128i shuffle_vector = vectors[index]; - // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, less at small - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = 
_mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - _mm_storeu_si128(mout, *prev); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - _mm_storel_epi64(mout + 1, *prev); -// _mm_storeu_si128(mout + 1, *prev); - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); - //_mm_storeu_si128(mout, *prev); - _mm_storel_epi64(mout, *prev); + if (!(mask & 
0xFFFF)) { + __m128i result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout, *prev); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 1, *prev); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 2, *prev); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 3, *prev); + *ints_read = 16; + return 16; + } + + uint32_t low_12_bits = mask & 0xFFF; + // combine index and bytes consumed into a single lookup + index_bytes_consumed combined = combined_lookup[low_12_bits]; + uint8_t consumed = combined.bytes_consumed; + uint8_t index = combined.index; + + __m128i shuffle_vector = vectors[index]; + // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, + //less at small + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + *prev = PrefixSum(unpacked_result_a, *prev); + _mm_storeu_si128(mout, *prev); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + *prev = PrefixSum2ints(unpacked_result_b, *prev); + _mm_storel_epi64(mout + 1, *prev); + // _mm_storeu_si128(mout + 1, *prev); return consumed; + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i 
middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout, *prev); + return consumed; + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); + __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + *prev = PrefixSum2ints(result, *prev); + //_mm_storeu_si128(mout, *prev); + _mm_storel_epi64(mout, *prev); + return consumed; } - -static int read_int_group(const uint8_t* in, uint32_t* out, int* ints_read) { - - __m128i initial = _mm_lddqu_si128((const __m128i *) in); - __m128i * const mout = (__m128i *) out; - - int mask = _mm_movemask_epi8(initial); - if (mask == 0) { - __m128i result; - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout, result); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout + 1, result); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout + 2, result); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 3, result); - 
*ints_read = 16; - return 16; - } - int mask2 = mask & 0xFFF; - index_bytes_consumed combined = combined_lookup[mask2]; - - int index = combined.index; - - __m128i shuffle_vector = vectors[index]; - int consumed = combined.bytes_consumed; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - _mm_storeu_si128(mout, unpacked_result_a); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - _mm_storel_epi64(mout + 1, unpacked_result_b); - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - _mm_storeu_si128(mout, result); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, 
shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - - _mm_storel_epi64(mout, result); +static int read_int_group(const uint8_t *in, uint32_t *out, int *ints_read) { + + __m128i initial = _mm_lddqu_si128((const __m128i *)in); + __m128i *const mout = (__m128i *)out; + + int mask = _mm_movemask_epi8(initial); + if (mask == 0) { + __m128i result; + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + _mm_storeu_si128(mout, result); + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + _mm_storeu_si128(mout + 1, result); + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + _mm_storeu_si128(mout + 2, result); + result = _mm_cvtepi8_epi32(initial); + _mm_storeu_si128(mout + 3, result); + *ints_read = 16; + return 16; + } + int mask2 = mask & 0xFFF; + index_bytes_consumed combined = combined_lookup[mask2]; + + int index = combined.index; + + __m128i shuffle_vector = vectors[index]; + int consumed = combined.bytes_consumed; + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + _mm_storeu_si128(mout, unpacked_result_a); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + _mm_storel_epi64(mout + 1, unpacked_result_b); + return consumed; + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, 
shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + _mm_storeu_si128(mout, result); return consumed; + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); + __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + + _mm_storel_epi64(mout, result); + return consumed; } - // len_signed : number of ints we want to decode -size_t masked_vbyte_read_loop(const uint8_t* in, uint32_t* out, +size_t masked_vbyte_read_loop(const uint8_t *in, uint32_t *out, uint64_t length) { - //uint64_t length = (uint64_t) len_signed; // number of ints we want to decode - size_t consumed = 0; // number of bytes read - uint64_t count = 0; // how many integers we have read so far - - uint64_t sig = 0; - uint64_t availablebytes = 0; - if (96 < length) { - size_t scanned = 0; + // uint64_t length = (uint64_t) len_signed; // number of ints we want to + // decode + size_t consumed = 0; // number of bytes read + uint64_t count = 0; // how many integers we have read so far + uint64_t 
sig = 0; + uint64_t availablebytes = 0; + if (96 < length) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = _mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - 
lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group(in + consumed, - out + count, sig, &ints_read); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (availablebytes + count < length) { - if (availablebytes < 16) { - if (availablebytes + count + 31 < length) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if (availablebytes + count + 15 < length) { - int newsig = 
_mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + + uint64_t reload = scanned - 16; + scanned += 48; + + // need to reload when less than 16 scanned bytes remain in sig + while (consumed < reload) { uint64_t ints_read; + uint8_t bytes = masked_vbyte_read_group(in + consumed, out + count, sig, + &ints_read); + sig >>= bytes; - uint8_t eaten = masked_vbyte_read_group(in + consumed, out + count, - sig, &ints_read); - consumed += eaten; - availablebytes -= eaten; - sig >>= eaten; + // seems like this might force the compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force earliest evaluation + + consumed += bytes; count += ints_read; + } + } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to + // 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = scanned - consumed; + } + while (availablebytes + count < length) { + if (availablebytes < 16) { + if (availablebytes + count + 31 < length) { +#ifdef __AVX2__ + uint64_t newsigavx = (uint32_t)_mm256_movemask_epi8( + _mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); + sig |= (newsigavx << availablebytes); +#else + uint64_t newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + uint64_t newsig2 = _mm_movemask_epi8(_mm_lddqu_si128( + (const __m128i *)(in + availablebytes + 16 + consumed))); + sig |= (newsig << availablebytes) | (newsig2 << (availablebytes + 16)); +#endif + availablebytes += 32; + } else if (availablebytes + count + 15 < length) { + int newsig = 
_mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + sig |= newsig << availablebytes; + availablebytes += 16; + } else { + break; + } } - for (; count < length; count++) { - consumed += read_int(in + consumed, out + count); - } - return consumed; + uint64_t ints_read; + + uint8_t eaten = + masked_vbyte_read_group(in + consumed, out + count, sig, &ints_read); + consumed += eaten; + availablebytes -= eaten; + sig >>= eaten; + count += ints_read; + } + for (; count < length; count++) { + consumed += read_int(in + consumed, out + count); + } + return consumed; } - // inputsize : number of input bytes we want to decode // returns the number of written ints -size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t* in, uint32_t* out, - size_t inputsize) { - size_t consumed = 0; // number of bytes read - uint32_t * initout = out; - - uint64_t sig = 0; - uint64_t availablebytes = 0; - if (96 < inputsize) { - size_t scanned = 0; +size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, + uint32_t *out, + size_t inputsize) { + size_t consumed = 0; // number of bytes read + uint32_t *initout = out; + uint64_t sig = 0; + uint64_t availablebytes = 0; + if (96 < inputsize) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + 
uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = _mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group(in + consumed, - out, sig, &ints_read); - sig 
>>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - out += ints_read; - } - } while (scanned + 112 < inputsize); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (1) { - if (availablebytes < 16) { - if (availablebytes + consumed + 31 < inputsize) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if(availablebytes + consumed + 15 < inputsize ) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + + uint64_t reload = scanned - 16; + scanned += 48; + + // need to reload when less than 16 scanned bytes remain in sig + while (consumed < reload) { uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group(in + consumed, out, - sig, &ints_read); - consumed += bytes; - availablebytes -= bytes; + uint8_t bytes = + masked_vbyte_read_group(in + consumed, out, sig, &ints_read); sig >>= bytes; + + // seems like this might force the 
compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force earliest evaluation + + consumed += bytes; out += ints_read; + } + } while (scanned + 112 < inputsize); // 112 == 48 + 48 ahead for scanning + + // up to 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = scanned - consumed; + } + while (1) { + if (availablebytes < 16) { + if (availablebytes + consumed + 31 < inputsize) { +#ifdef __AVX2__ + uint64_t newsigavx = (uint32_t)_mm256_movemask_epi8( + _mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); + sig |= (newsigavx << availablebytes); +#else + uint64_t newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + uint64_t newsig2 = _mm_movemask_epi8(_mm_lddqu_si128( + (const __m128i *)(in + availablebytes + 16 + consumed))); + sig |= (newsig << availablebytes) | (newsig2 << (availablebytes + 16)); +#endif + availablebytes += 32; + } else if (availablebytes + consumed + 15 < inputsize) { + int newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + sig |= newsig << availablebytes; + availablebytes += 16; + } else { + break; + } } - while (consumed < inputsize) { - unsigned int shift = 0; - for (uint32_t v = 0; consumed < inputsize; shift += 7) { - uint8_t c = in[consumed++]; - if ((c & 128) == 0) { - out[0] = v + (c << shift); - ++out; - break; - } else { - v += (c & 127) << shift; - } - } + uint64_t ints_read; + uint8_t bytes = + masked_vbyte_read_group(in + consumed, out, sig, &ints_read); + consumed += bytes; + availablebytes -= bytes; + sig >>= bytes; + out += ints_read; + } + while (consumed < inputsize) { + unsigned int shift = 0; + for (uint32_t v = 0; consumed < inputsize; shift += 7) { + uint8_t c = in[consumed++]; + if ((c & 128) == 0) { + out[0] = v + (c << shift); + ++out; + break; + } else { + v += (c & 127) << shift; + } } - return out - initout; + 
} + return out - initout; } - -size_t read_ints(const uint8_t* in, uint32_t* out, int length) { - size_t consumed = 0; - int count; - for (count = 0; count + 15 < length;) { - int ints_read; - consumed += read_int_group(in + consumed, out + count, &ints_read); - count += ints_read; - } - for (; count < length; count++) { - consumed += read_int(in + consumed, out + count); - } - return consumed; +size_t read_ints(const uint8_t *in, uint32_t *out, int length) { + size_t consumed = 0; + int count; + for (count = 0; count + 15 < length;) { + int ints_read; + consumed += read_int_group(in + consumed, out + count, &ints_read); + count += ints_read; + } + for (; count < length; count++) { + consumed += read_int(in + consumed, out + count); + } + return consumed; } -static int read_int_group_delta(const uint8_t* in, uint32_t* out, - int* ints_read, __m128i * prev) { - - __m128i initial = _mm_lddqu_si128((const __m128i *) in); - __m128i * const mout = (__m128i *) out; - - int mask = _mm_movemask_epi8(initial); - if (mask == 0) { - __m128i result; - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 1, *prev); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 2, *prev); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 3, *prev); - *ints_read = 16; - return 16; - } - int mask2 = mask & 0xFFF; - index_bytes_consumed combined = combined_lookup[mask2]; - - int index = combined.index; - - __m128i shuffle_vector = vectors[index]; - int consumed = combined.bytes_consumed; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = 
_mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - _mm_storeu_si128(mout, *prev); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - _mm_storeu_si128(mout + 1, *prev); - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - - return consumed; - } +static int read_int_group_delta(const uint8_t *in, uint32_t *out, + int *ints_read, __m128i *prev) { + + __m128i initial = _mm_lddqu_si128((const __m128i *)in); + __m128i *const mout = (__m128i *)out; - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = 
_mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); + int mask = _mm_movemask_epi8(initial); + if (mask == 0) { + __m128i result; + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout, *prev); + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 1, *prev); + result = _mm_cvtepi8_epi32(initial); + initial = _mm_srli_si128(initial, 4); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 2, *prev); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout + 3, *prev); + *ints_read = 16; + return 16; + } + int mask2 = mask & 0xFFF; + index_bytes_consumed combined = combined_lookup[mask2]; + + int index = combined.index; + + __m128i shuffle_vector = vectors[index]; + int consumed = combined.bytes_consumed; + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + *prev = PrefixSum(unpacked_result_a, *prev); _mm_storeu_si128(mout, *prev); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + *prev = PrefixSum2ints(unpacked_result_b, *prev); + _mm_storeu_si128(mout + 1, *prev); return consumed; -} + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, 
shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + *prev = PrefixSum(result, *prev); + _mm_storeu_si128(mout, *prev); + return consumed; + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); + __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + *prev = PrefixSum2ints(result, *prev); + _mm_storeu_si128(mout, *prev); + return consumed; +} // len_signed : number of ints we want to decode -size_t masked_vbyte_read_loop_delta(const uint8_t* in, uint32_t* out, +size_t masked_vbyte_read_loop_delta(const uint8_t *in, uint32_t *out, uint64_t length, uint32_t prev) { - //uint64_t length = (uint64_t) len_signed; // number of ints we want to decode - size_t consumed = 0; // number of bytes read - __m128i mprev = _mm_set1_epi32(prev); - uint64_t count = 0; // how many integers we have read so far - - uint64_t sig = 0; - int availablebytes = 0; - if (96 < length) { - size_t scanned = 0; + // uint64_t length = (uint64_t) len_signed; // number of ints we want 
to + // decode + size_t consumed = 0; // number of bytes read + __m128i mprev = _mm_set1_epi32(prev); + uint64_t count = 0; // how many integers we have read so far + uint64_t sig = 0; + int availablebytes = 0; + if (96 < length) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = 
_mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group_delta(in + consumed, - out + count, sig, &ints_read, &mprev); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = (int) (scanned - consumed); - } - while (availablebytes + count < length) { - if (availablebytes < 16) break; + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + + uint64_t reload = scanned - 16; + scanned += 48; + // need to reload when less than 16 scanned bytes remain in sig + while (consumed < reload) { uint64_t ints_read; - uint8_t eaten = 
masked_vbyte_read_group_delta(in + consumed, out + count, - sig, &ints_read, &mprev); - consumed += eaten; - availablebytes -= eaten; - sig >>= eaten; + uint8_t bytes = masked_vbyte_read_group_delta( + in + consumed, out + count, sig, &ints_read, &mprev); + sig >>= bytes; + + // seems like this might force the compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force earliest evaluation + + consumed += bytes; count += ints_read; - } - prev = _mm_extract_epi32(mprev, 3); - for (; count < length; count++) { - consumed += read_int_delta(in + consumed, out + count, &prev); - } - return consumed; + } + } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to + // 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = (int)(scanned - consumed); + } + while (availablebytes + count < length) { + if (availablebytes < 16) + break; + + uint64_t ints_read; + uint8_t eaten = masked_vbyte_read_group_delta(in + consumed, out + count, + sig, &ints_read, &mprev); + consumed += eaten; + availablebytes -= eaten; + sig >>= eaten; + count += ints_read; + } + prev = _mm_extract_epi32(mprev, 3); + for (; count < length; count++) { + consumed += read_int_delta(in + consumed, out + count, &prev); + } + return consumed; } -size_t read_ints_delta(const uint8_t* in, uint32_t* out, int length, +size_t read_ints_delta(const uint8_t *in, uint32_t *out, int length, uint32_t prev) { - __m128i mprev = _mm_set1_epi32(prev); - size_t consumed = 0; - int count; - for (count = 0; count + 15 < length;) { - int ints_read; - consumed += read_int_group_delta(in + consumed, out + count, &ints_read, - &mprev); - count += ints_read; - } - prev = _mm_extract_epi32(mprev, 3); - for (; count < length; count++) { - consumed += read_int_delta(in + consumed, out + count, &prev); - } - return consumed; + __m128i mprev = _mm_set1_epi32(prev); + size_t consumed = 0; + int count; + for (count = 0; 
count + 15 < length;) { + int ints_read; + consumed += + read_int_group_delta(in + consumed, out + count, &ints_read, &mprev); + count += ints_read; + } + prev = _mm_extract_epi32(mprev, 3); + for (; count < length; count++) { + consumed += read_int_delta(in + consumed, out + count, &prev); + } + return consumed; } - - // inputsize : number of input bytes we want to decode // returns the number of written ints -size_t masked_vbyte_read_loop_fromcompressedsize_delta(const uint8_t* in, uint32_t* out, - size_t inputsize, uint32_t prev) { - size_t consumed = 0; // number of bytes read - uint32_t * initout = out; - __m128i mprev = _mm_set1_epi32(prev); - uint64_t sig = 0; - uint64_t availablebytes = 0; - if (96 < inputsize) { - size_t scanned = 0; - +size_t masked_vbyte_read_loop_fromcompressedsize_delta(const uint8_t *in, + uint32_t *out, + size_t inputsize, + uint32_t prev) { + size_t consumed = 0; // number of bytes read + uint32_t *initout = out; + __m128i mprev = _mm_set1_epi32(prev); + uint64_t sig = 0; + uint64_t availablebytes = 0; + if (96 < inputsize) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about 
what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = _mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group_delta(in + consumed, - out, sig, &ints_read, &mprev); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake 
check to force earliest evaluation - - consumed += bytes; - out += ints_read; - } - } while (scanned + 112 < inputsize); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (1) { - if (availablebytes < 16) { - if (availablebytes + consumed + 31 < inputsize) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if(availablebytes + consumed + 15 < inputsize ) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + + uint64_t reload = scanned - 16; + scanned += 48; + + // need to reload when less than 16 scanned bytes remain in sig + while (consumed < reload) { uint64_t ints_read; - uint8_t bytes = masked_vbyte_read_group_delta(in + consumed, out, - sig, &ints_read, &mprev); - consumed += bytes; - availablebytes -= bytes; + uint8_t bytes = masked_vbyte_read_group_delta(in + consumed, out, sig, + &ints_read, &mprev); sig >>= bytes; + + // seems like this might force the compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force 
earliest evaluation + + consumed += bytes; out += ints_read; + } + } while (scanned + 112 < inputsize); // 112 == 48 + 48 ahead for scanning + + // up to 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = scanned - consumed; + } + while (1) { + if (availablebytes < 16) { + if (availablebytes + consumed + 31 < inputsize) { +#ifdef __AVX2__ + uint64_t newsigavx = (uint32_t)_mm256_movemask_epi8( + _mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); + sig |= (newsigavx << availablebytes); +#else + uint64_t newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + uint64_t newsig2 = _mm_movemask_epi8(_mm_lddqu_si128( + (const __m128i *)(in + availablebytes + 16 + consumed))); + sig |= (newsig << availablebytes) | (newsig2 << (availablebytes + 16)); +#endif + availablebytes += 32; + } else if (availablebytes + consumed + 15 < inputsize) { + int newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + sig |= newsig << availablebytes; + availablebytes += 16; + } else { + break; + } } - prev = _mm_extract_epi32(mprev, 3); - while (consumed < inputsize) { - unsigned int shift = 0; - for (uint32_t v = 0; consumed < inputsize; shift += 7) { - uint8_t c = in[consumed++]; - if ((c & 128) == 0) { - uint32_t delta = v + (c << shift); - prev += delta; - *out++ = prev; - break; - } else { - v += (c & 127) << shift; - } - } + uint64_t ints_read; + uint8_t bytes = masked_vbyte_read_group_delta(in + consumed, out, sig, + &ints_read, &mprev); + consumed += bytes; + availablebytes -= bytes; + sig >>= bytes; + out += ints_read; + } + prev = _mm_extract_epi32(mprev, 3); + while (consumed < inputsize) { + unsigned int shift = 0; + for (uint32_t v = 0; consumed < inputsize; shift += 7) { + uint8_t c = in[consumed++]; + if ((c & 128) == 0) { + uint32_t delta = v + (c << shift); + prev += delta; + *out++ = prev; + break; + } else { + v += (c & 127) << 
shift; + } } - return out - initout; + } + return out - initout; } -static SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes1[16 * 16 ] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +static SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes1[16 * 16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, + 15, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, + 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, + 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 
14, 15, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, }; -static const __m128i *shuffle_mask = (__m128i *) shuffle_mask_bytes1; +static const __m128i *shuffle_mask = (__m128i *)shuffle_mask_bytes1; /* perform a lower-bound search for |key| in |out|; the resulting uint32 * is stored in |*presult|.*/ -#define CHECK_AND_INCREMENT(i, out, key, presult) \ - do { \ - __m128i tmpout = _mm_sub_epi32(out, conversion); \ - uint32_t mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ - if (mask != 15) { \ - __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mask ^ 15]); \ - int offset; \ - offset = __builtin_ctz(mask ^ 15); \ - *presult = _mm_cvtsi128_si32(p); \ - return (i + offset); \ - } \ - i += 4; \ - } while (0) - +#define CHECK_AND_INCREMENT(i, out, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mask = \ + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mask != 15) { \ + __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mask ^ 15]); \ + int offset; \ + offset = __builtin_ctz(mask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + return (i + offset); \ + } \ + i += 4; \ + } while (0) /* perform a lower-bound search for |key| in |out|; the resulting uint32 * is stored in |*presult|.*/ -#define CHECK_AND_INCREMENT_2(i, out, key, presult) \ - do { \ - __m128i tmpout = _mm_sub_epi32(out, conversion); \ - uint32_t mask = 3 & _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ - if (mask != 3) { \ - __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mask ^ 3]); \ - int offset; \ - offset = __builtin_ctz(mask ^ 3); \ - *presult = _mm_cvtsi128_si32(p); \ - return (i + offset); \ - } \ - i += 2; \ - } while (0) +#define CHECK_AND_INCREMENT_2(i, out, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mask = \ + 3 & _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mask != 3) { \ + __m128i p = _mm_shuffle_epi8(out, 
shuffle_mask[mask ^ 3]); \ + int offset; \ + offset = __builtin_ctz(mask ^ 3); \ + *presult = _mm_cvtsi128_si32(p); \ + return (i + offset); \ + } \ + i += 2; \ + } while (0) static int masked_vbyte_search_group_delta(const uint8_t *in, uint8_t *p, - uint64_t mask, uint64_t *ints_read, __m128i *prev, - int i, uint32_t key, uint32_t *presult) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - __m128i conversion = _mm_set1_epi32(2147483648U); - __m128i key4 = _mm_set1_epi32(key - 2147483648U); - - if (!(mask & 0xFFFF)) { - __m128i result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - *ints_read = 16; - *p = 16; - return (-1); - } - - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; - uint8_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - __m128i shuffle_vector = vectors[index]; - // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, less at small - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i 
unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - //_mm_storel_epi64(&out, *prev); - CHECK_AND_INCREMENT_2(i, *prev, key, presult); - *p = consumed; - return (-1); - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - *prev = PrefixSum(result, *prev); - CHECK_AND_INCREMENT(i, *prev, key, presult); - *p = consumed; - return (-1); - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); + uint64_t mask, uint64_t *ints_read, + __m128i *prev, int i, uint32_t key, + uint32_t *presult) { + __m128i initial = 
_mm_lddqu_si128((const __m128i *)(in)); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + if (!(mask & 0xFFFF)) { + __m128i result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + *ints_read = 16; + *p = 16; + return (-1); + } + + uint32_t low_12_bits = mask & 0xFFF; + // combine index and bytes consumed into a single lookup + index_bytes_consumed combined = combined_lookup[low_12_bits]; + uint8_t consumed = combined.bytes_consumed; + uint8_t index = combined.index; + + __m128i shuffle_vector = vectors[index]; + // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, + //less at small + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + *prev = PrefixSum(unpacked_result_a, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + *prev = PrefixSum2ints(unpacked_result_b, *prev); //_mm_storel_epi64(&out, *prev); CHECK_AND_INCREMENT_2(i, *prev, key, 
presult); *p = consumed; return (-1); + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + *prev = PrefixSum(result, *prev); + CHECK_AND_INCREMENT(i, *prev, key, presult); + *p = consumed; + return (-1); + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); + __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + *prev = PrefixSum2ints(result, *prev); + //_mm_storel_epi64(&out, *prev); + CHECK_AND_INCREMENT_2(i, *prev, key, presult); + *p = consumed; + return (-1); } // returns the index of the matching key int masked_vbyte_search_delta(const uint8_t *in, uint64_t length, uint32_t prev, uint32_t key, uint32_t *presult) { - size_t consumed = 0; // number of bytes read - __m128i mprev = _mm_set1_epi32(prev); - uint64_t count = 0; // how many integers we have read so far - uint64_t sig = 0; - uint64_t availablebytes = 0; - if (96 < length) { - size_t 
scanned = 0; + size_t consumed = 0; // number of bytes read + __m128i mprev = _mm_set1_epi32(prev); + uint64_t count = 0; // how many integers we have read so far + uint64_t sig = 0; + uint64_t availablebytes = 0; + if (96 < length) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = 
_mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read = 0; - uint8_t bytes = 0; - int ret = masked_vbyte_search_group_delta(in + consumed, &bytes, - sig, &ints_read, &mprev, - (int) count, key, presult); - if (ret >= 0) - return (ret); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; - while (availablebytes + count < length) { - if (availablebytes < 16) break; + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + uint64_t reload = scanned - 16; + scanned += 48; + + // need to reload when less than 16 scanned bytes remain in sig + while (consumed 
< reload) { uint64_t ints_read = 0; uint8_t bytes = 0; - int ret = masked_vbyte_search_group_delta(in + consumed, &bytes, - sig, &ints_read, &mprev, (int) count, key, presult); + int ret = masked_vbyte_search_group_delta(in + consumed, &bytes, sig, + &ints_read, &mprev, + (int)count, key, presult); if (ret >= 0) - return (ret); - consumed += bytes; - availablebytes -= bytes; + return (ret); sig >>= bytes; + + // seems like this might force the compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force earliest evaluation + + consumed += bytes; count += ints_read; + } + } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to + // 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = scanned - consumed; + } + + while (availablebytes + count < length) { + if (availablebytes < 16) + break; + + uint64_t ints_read = 0; + uint8_t bytes = 0; + int ret = + masked_vbyte_search_group_delta(in + consumed, &bytes, sig, &ints_read, + &mprev, (int)count, key, presult); + if (ret >= 0) + return (ret); + consumed += bytes; + availablebytes -= bytes; + sig >>= bytes; + count += ints_read; + } + prev = _mm_extract_epi32(mprev, 3); + for (; count < length; count++) { + uint32_t out; + consumed += read_int_delta(in + consumed, &out, &prev); + if (key <= prev) { + *presult = prev; + return (int)count; } - prev = _mm_extract_epi32(mprev, 3); - for (; count < length; count++) { - uint32_t out; - consumed += read_int_delta(in + consumed, &out, &prev); - if (key <= prev) { - *presult = prev; - return (int) count; - } - } + } - *presult = key + 1; - return (int) length; + *presult = key + 1; + return (int)length; } -static SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes2[16 * 16 ] = { - 0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0, - 4,5,6,7,0,0,0,0,0,0,0,0,0,0,0,0, - 8,9,10,11,0,0,0,0,0,0,0,0,0,0,0,0, - 12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0, +static SIMDCOMP_ALIGNED(16) int8_t 
shuffle_mask_bytes2[16 * 16] = { + 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -static const __m128i *shuffle_mask_extract = (__m128i *) shuffle_mask_bytes2; +static const __m128i *shuffle_mask_extract = (__m128i *)shuffle_mask_bytes2; -static uint32_t branchlessextract (__m128i out, int i) { - return _mm_cvtsi128_si32(_mm_shuffle_epi8(out,shuffle_mask_extract[i])); +static uint32_t branchlessextract(__m128i out, int i) { + return _mm_cvtsi128_si32(_mm_shuffle_epi8(out, shuffle_mask_extract[i])); } -#define CHECK_SELECT(i, out, slot, presult) \ - i += 4; \ - if (i > slot) { \ - *presult = branchlessextract (out, slot - (i - 4)); \ - return (1); \ - } +#define CHECK_SELECT(i, out, slot, presult) \ + i += 4; \ + if (i > slot) { \ + *presult = branchlessextract(out, slot - (i - 4)); \ + return (1); \ + } - -#define CHECK_SELECT_2(i, out, slot, presult) \ - i += 2; \ - if (i > slot) { \ - *presult = branchlessextract (out, slot - (i - 2)); \ - return (1); \ - } +#define CHECK_SELECT_2(i, out, slot, presult) \ + i += 2; \ + if (i > slot) { \ + *presult = branchlessextract(out, slot - (i - 2)); \ + return (1); \ + } static int masked_vbyte_select_group_delta(const uint8_t *in, uint8_t *p, - uint64_t mask, uint64_t *ints_read, __m128i *prev, - int slot, uint32_t *presult) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - int i = 0; - - if (!(mask & 0xFFFF)) { - __m128i result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_SELECT(i, *prev, slot, presult); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_SELECT(i, *prev, slot, presult); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_SELECT(i, *prev, slot, presult); - initial = 
_mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - CHECK_SELECT(i, *prev, slot, presult); - *ints_read = 16; - *p = 16; - return (0); - } - - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; - uint8_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - __m128i shuffle_vector = vectors[index]; - // __m128i shuffle_vector = {0, 0}; // speed check: 20% faster at large, less at small - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - CHECK_SELECT(i, *prev, slot, presult); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - //_mm_storel_epi64(&out, *prev); - CHECK_SELECT_2(i, *prev, slot, presult); - *p = consumed; - return (0); - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, 
high_bytes_shifted); - *prev = PrefixSum(result, *prev); - CHECK_SELECT(i, *prev, slot, presult); - *p = consumed; - return (0); - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); + uint64_t mask, uint64_t *ints_read, + __m128i *prev, int slot, + uint32_t *presult) { + __m128i initial = _mm_lddqu_si128((const __m128i *)(in)); + int i = 0; + + if (!(mask & 0xFFFF)) { + __m128i result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_SELECT(i, *prev, slot, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_SELECT(i, *prev, slot, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_SELECT(i, *prev, slot, presult); + initial = _mm_srli_si128(initial, 4); + result = _mm_cvtepi8_epi32(initial); + *prev = PrefixSum(result, *prev); + CHECK_SELECT(i, *prev, slot, presult); + *ints_read = 16; + *p = 16; + return (0); + } + + uint32_t low_12_bits = mask & 0xFFF; + // combine index and bytes consumed into a single lookup + index_bytes_consumed combined = combined_lookup[low_12_bits]; + uint8_t consumed = combined.bytes_consumed; + uint8_t index = combined.index; + + __m128i shuffle_vector = vectors[index]; + // __m128i shuffle_vector = {0, 0}; // speed 
check: 20% faster at large, + //less at small + + if (index < 64) { + *ints_read = 6; + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x007F)); + __m128i high_bytes = _mm_and_si128(bytes_to_decode, _mm_set1_epi16(0x7F00)); + __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); + __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); + __m128i unpacked_result_a = + _mm_and_si128(packed_result, _mm_set1_epi32(0x0000FFFF)); + *prev = PrefixSum(unpacked_result_a, *prev); + CHECK_SELECT(i, *prev, slot, presult); + __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); + *prev = PrefixSum2ints(unpacked_result_b, *prev); //_mm_storel_epi64(&out, *prev); CHECK_SELECT_2(i, *prev, slot, presult); *p = consumed; return (0); + } + if (index < 145) { + + *ints_read = 4; + + __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); + __m128i low_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x0000007F)); + __m128i middle_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x00007F00)); + __m128i high_bytes = + _mm_and_si128(bytes_to_decode, _mm_set1_epi32(0x007F0000)); + __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); + __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); + __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); + __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); + *prev = PrefixSum(result, *prev); + CHECK_SELECT(i, *prev, slot, presult); + *p = consumed; + return (0); + } + + *ints_read = 2; + + __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); + __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); + __m128i split_bytes = _mm_mullo_epi16( + bytes_to_decode, _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); + __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); + __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); 
+ __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); + __m128i result_evens = _mm_or_si128(recombined, low_byte); + __m128i result = _mm_shuffle_epi8( + result_evens, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1)); + *prev = PrefixSum2ints(result, *prev); + //_mm_storel_epi64(&out, *prev); + CHECK_SELECT_2(i, *prev, slot, presult); + *p = consumed; + return (0); } - - uint32_t masked_vbyte_select_delta(const uint8_t *in, uint64_t length, uint32_t prev, size_t slot) { - size_t consumed = 0; // number of bytes read - __m128i mprev = _mm_set1_epi32(prev); - uint64_t count = 0; // how many integers we have read so far - uint64_t sig = 0; - uint64_t availablebytes = 0; - if (96 < length) { - size_t scanned = 0; + size_t consumed = 0; // number of bytes read + __m128i mprev = _mm_set1_epi32(prev); + uint64_t count = 0; // how many integers we have read so far + uint64_t sig = 0; + uint64_t availablebytes = 0; + if (96 < length) { + size_t scanned = 0; #ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); + __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); + uint32_t lowSig = _mm256_movemask_epi8(low); #else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; + __m128i low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + uint32_t lowSig1 = _mm_movemask_epi8(low1); + __m128i low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + uint32_t lowSig2 = _mm_movemask_epi8(low2); + uint32_t lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 
32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; + // excess verbosity to avoid problems with sign extension on conversions + // better to think about what's happening and make it clearer + __m128i high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + uint32_t highSig = _mm_movemask_epi8(high); + uint64_t nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; + scanned += 48; - do { - uint64_t thisSig = nextSig; + do { + uint64_t thisSig = nextSig; #ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); + low = _mm256_loadu_si256((__m256i *)(in + scanned)); + lowSig = _mm256_movemask_epi8(low); #else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; + low1 = _mm_loadu_si128((__m128i *)(in + scanned)); + lowSig1 = _mm_movemask_epi8(low1); + low2 = _mm_loadu_si128((__m128i *)(in + scanned + 16)); + lowSig2 = _mm_movemask_epi8(low2); + lowSig = lowSig2 << 16; + lowSig |= lowSig1; #endif - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint32_t result; - uint64_t ints_read; - uint8_t bytes; - if (masked_vbyte_select_group_delta(in + consumed, &bytes, - sig, &ints_read, &mprev, - (int) (slot - count), &result)) { - return (result); - } - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to 
force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to 16 remaining in sig - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (availablebytes + count < length) { - if (availablebytes < 16) break; + high = _mm_loadu_si128((__m128i *)(in + scanned + 32)); + highSig = _mm_movemask_epi8(high); + nextSig = highSig; + nextSig <<= 32; + nextSig |= lowSig; - if (availablebytes < 16) { - if (availablebytes + count + 31 < length) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if (availablebytes + count + 15 < length) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } + uint64_t remaining = scanned - (consumed + 48); + sig = (thisSig << remaining) | sig; + + uint64_t reload = scanned - 16; + scanned += 48; + // need to reload when less than 16 scanned bytes remain in sig + while (consumed < reload) { uint32_t result; uint64_t ints_read; uint8_t bytes; - if (masked_vbyte_select_group_delta(in + consumed, &bytes, - sig, &ints_read, &mprev, - (int) (slot - count), &result)) { - return (result); + if (masked_vbyte_select_group_delta(in + consumed, &bytes, sig, + &ints_read, &mprev, + (int)(slot - count), &result)) { + return (result); } - consumed += bytes; - availablebytes -= bytes; sig >>= bytes; 
+ + // seems like this might force the compiler to prioritize shifting sig + // >>= bytes + if (sig == 0xFFFFFFFFFFFFFFFF) + return 0; // fake check to force earliest evaluation + + consumed += bytes; count += ints_read; + } + } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to + // 16 remaining in sig + sig = (nextSig << (scanned - consumed - 48)) | sig; + availablebytes = scanned - consumed; + } + while (availablebytes + count < length) { + if (availablebytes < 16) + break; + + if (availablebytes < 16) { + if (availablebytes + count + 31 < length) { +#ifdef __AVX2__ + uint64_t newsigavx = (uint32_t)_mm256_movemask_epi8( + _mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); + sig |= (newsigavx << availablebytes); +#else + uint64_t newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + uint64_t newsig2 = _mm_movemask_epi8(_mm_lddqu_si128( + (const __m128i *)(in + availablebytes + 16 + consumed))); + sig |= (newsig << availablebytes) | (newsig2 << (availablebytes + 16)); +#endif + availablebytes += 32; + } else if (availablebytes + count + 15 < length) { + int newsig = _mm_movemask_epi8( + _mm_lddqu_si128((const __m128i *)(in + availablebytes + consumed))); + sig |= newsig << availablebytes; + availablebytes += 16; + } else { + break; + } } - prev = _mm_extract_epi32(mprev, 3); - for (; count < slot + 1; count++) { - uint32_t out; - consumed += read_int_delta(in + consumed, &out, &prev); + uint32_t result; + uint64_t ints_read; + uint8_t bytes; + if (masked_vbyte_select_group_delta(in + consumed, &bytes, sig, &ints_read, + &mprev, (int)(slot - count), &result)) { + return (result); } - - return prev; + consumed += bytes; + availablebytes -= bytes; + sig >>= bytes; + count += ints_read; + } + + prev = _mm_extract_epi32(mprev, 3); + for (; count < slot + 1; count++) { + uint32_t out; + consumed += read_int_delta(in + consumed, &out, &prev); + } + + return prev; } diff --git 
a/tools/clang-format.sh b/tools/clang-format.sh new file mode 100755 index 0000000..4cf2b0f --- /dev/null +++ b/tools/clang-format.sh @@ -0,0 +1,22 @@ +STYLE=$(which clang-format) +if [ $? -ne 0 ]; then + echo "clang-format not installed. Unable to check source file format policy." >&2 + exit 1 +fi +RE=0 +BASE=$(git rev-parse --show-toplevel) +if [ $? -ne 0 ]; then + ALLFILES=$(find . -name '*.c' -o -name '*.cpp' -o -name '*.h' -o -name '*.cc' -o -name '*.hh') +else + ALLFILES=$(git ls-tree --full-tree --name-only -r HEAD | grep -e ".*\.\(c\|h\|cc\|cpp\|hh\)\$") +fi +for FILE in $ALLFILES; do + $STYLE $BASE/$FILE | cmp -s $BASE/$FILE - + if [ $? -ne 0 ]; then + echo "$BASE/$FILE does not respect the coding style. Formatting. " >&2 + $STYLE -i $BASE/$FILE + RE=1 + fi +done + +exit $RE